howard.objects.variants

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26import cyvcf2
   27import pyBigWig
   28
   29from howard.functions.commons import *
   30from howard.objects.database import *
   31from howard.functions.databases import *
   32from howard.functions.utils import *
   33
   34
   35class Variants:
   36
   37    def __init__(
   38        self,
   39        conn=None,
   40        input: str = None,
   41        output: str = None,
   42        config: dict = {},
   43        param: dict = {},
   44        load: bool = False,
   45    ) -> None:
   46        """
   47        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   48        header
   49
   50        :param conn: the connection to the database
   51        :param input: the input file
   52        :param output: the output file
   53        :param config: a dictionary containing the configuration of the model
   54        :param param: a dictionary containing the parameters of the model
   55        """
   56
   57        # Init variables
   58        self.init_variables()
   59
   60        # Input
   61        self.set_input(input)
   62
   63        # Config
   64        self.set_config(config)
   65
   66        # Param
   67        self.set_param(param)
   68
   69        # Output
   70        self.set_output(output)
   71
   72        # connexion
   73        self.set_connexion(conn)
   74
   75        # Header
   76        self.set_header()
   77
   78        # Samples
   79        self.set_samples()
   80
   81        # Load data
   82        if load:
   83            self.load_data()
   84
   85    def set_samples(self, samples: list = None) -> list:
   86        """
   87        The function `set_samples` sets the samples attribute of an object to a provided list or
   88        retrieves it from a parameter dictionary.
   89
   90        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   91        input and sets the `samples` attribute of the class to the provided list. If no samples are
   92        provided, it tries to get the samples from the class's parameters using the `get_param` method
   93        :type samples: list
   94        :return: The `samples` list is being returned.
   95        """
   96
   97        if not samples:
   98            samples = self.get_param().get("samples", {}).get("list", None)
   99
  100        self.samples = samples
  101
  102        return samples
  103
  104    def get_samples(self) -> list:
  105        """
  106        This function returns a list of samples.
  107        :return: The `get_samples` method is returning the `samples` attribute of the object.
  108        """
  109
  110        return self.samples
  111
    def get_samples_check(self) -> bool:
        """
        This function returns the value of the "check" key within the "samples"
        dictionary retrieved from the parameters.
        :return: The method `get_samples_check` is returning the value of the
        key "check" inside the "samples" dictionary, which is nested inside the
        dictionary returned by the `get_param()` method. If the key "check" is
        not found, it will return `True` (samples are checked by default).
        """

        return self.get_param().get("samples", {}).get("check", True)
  122
  123    def set_input(self, input: str = None) -> None:
  124        """
  125        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  126        attributes in the class accordingly.
  127
  128        :param input: The `set_input` method in the provided code snippet is used to set attributes
  129        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  130        :type input: str
  131        """
  132
  133        if input and not isinstance(input, str):
  134            try:
  135                self.input = input.name
  136            except:
  137                log.error(f"Input file '{input} in bad format")
  138                raise ValueError(f"Input file '{input} in bad format")
  139        else:
  140            self.input = input
  141
  142        # Input format
  143        if input:
  144            input_name, input_extension = os.path.splitext(self.input)
  145            self.input_name = input_name
  146            self.input_extension = input_extension
  147            self.input_format = self.input_extension.replace(".", "")
  148
  149    def set_config(self, config: dict) -> None:
  150        """
  151        The set_config function takes a config object and assigns it as the configuration object for the
  152        class.
  153
  154        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  155        contains configuration settings for the class. When you call the `set_config` function with a
  156        dictionary object as the argument, it will set that dictionary as the configuration object for
  157        the class
  158        :type config: dict
  159        """
  160
  161        self.config = config
  162
  163    def set_param(self, param: dict) -> None:
  164        """
  165        This function sets a parameter object for the class based on the input dictionary.
  166
  167        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  168        as the `param` attribute of the class instance
  169        :type param: dict
  170        """
  171
  172        self.param = param
  173
  174    def init_variables(self) -> None:
  175        """
  176        This function initializes the variables that will be used in the rest of the class
  177        """
  178
  179        self.prefix = "howard"
  180        self.table_variants = "variants"
  181        self.dataframe = None
  182
  183        self.comparison_map = {
  184            "gt": ">",
  185            "gte": ">=",
  186            "lt": "<",
  187            "lte": "<=",
  188            "equals": "=",
  189            "contains": "SIMILAR TO",
  190        }
  191
  192        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  193
  194        self.code_type_map_to_sql = {
  195            "Integer": "INTEGER",
  196            "String": "VARCHAR",
  197            "Float": "FLOAT",
  198            "Flag": "VARCHAR",
  199        }
  200
  201        self.index_additionnal_fields = []
  202
  203    def get_indexing(self) -> bool:
  204        """
  205        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  206        returns False.
  207        :return: The value of the indexing parameter.
  208        """
  209
  210        return self.get_param().get("indexing", False)
  211
  212    def get_connexion_config(self) -> dict:
  213        """
  214        The function `get_connexion_config` returns a dictionary containing the configuration for a
  215        connection, including the number of threads and memory limit.
  216        :return: a dictionary containing the configuration for the Connexion library.
  217        """
  218
  219        # config
  220        config = self.get_config()
  221
  222        # Connexion config
  223        connexion_config = {}
  224        threads = self.get_threads()
  225
  226        # Threads
  227        if threads:
  228            connexion_config["threads"] = threads
  229
  230        # Memory
  231        # if config.get("memory", None):
  232        #     connexion_config["memory_limit"] = config.get("memory")
  233        if self.get_memory():
  234            connexion_config["memory_limit"] = self.get_memory()
  235
  236        # Temporary directory
  237        if config.get("tmp", None):
  238            connexion_config["temp_directory"] = config.get("tmp")
  239
  240        # Access
  241        if config.get("access", None):
  242            access = config.get("access")
  243            if access in ["RO"]:
  244                access = "READ_ONLY"
  245            elif access in ["RW"]:
  246                access = "READ_WRITE"
  247            connexion_db = self.get_connexion_db()
  248            if connexion_db in ":memory:":
  249                access = "READ_WRITE"
  250            connexion_config["access_mode"] = access
  251
  252        return connexion_config
  253
  254    def get_duckdb_settings(self) -> dict:
  255        """
  256        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  257        string.
  258        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  259        """
  260
  261        # config
  262        config = self.get_config()
  263
  264        # duckdb settings
  265        duckdb_settings_dict = {}
  266        if config.get("duckdb_settings", None):
  267            duckdb_settings = config.get("duckdb_settings")
  268            duckdb_settings = full_path(duckdb_settings)
  269            # duckdb setting is a file
  270            if os.path.exists(duckdb_settings):
  271                with open(duckdb_settings) as json_file:
  272                    duckdb_settings_dict = yaml.safe_load(json_file)
  273            # duckdb settings is a string
  274            else:
  275                duckdb_settings_dict = json.loads(duckdb_settings)
  276
  277        return duckdb_settings_dict
  278
  279    def set_connexion_db(self) -> str:
  280        """
  281        The function `set_connexion_db` returns the appropriate database connection string based on the
  282        input format and connection type.
  283        :return: the value of the variable `connexion_db`.
  284        """
  285
  286        # Default connexion db
  287        default_connexion_db = ":memory:"
  288
  289        # Find connexion db
  290        if self.get_input_format() in ["db", "duckdb"]:
  291            connexion_db = self.get_input()
  292        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  293            connexion_db = default_connexion_db
  294        elif self.get_connexion_type() in ["tmpfile"]:
  295            tmp_name = tempfile.mkdtemp(
  296                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  297            )
  298            connexion_db = f"{tmp_name}/tmp.db"
  299        elif self.get_connexion_type() != "":
  300            connexion_db = self.get_connexion_type()
  301        else:
  302            connexion_db = default_connexion_db
  303
  304        # Set connexion db
  305        self.connexion_db = connexion_db
  306
  307        return connexion_db
  308
  309    def set_connexion(self, conn) -> None:
  310        """
  311        The function `set_connexion` creates a connection to a database, with options for different
  312        database formats and settings.
  313
  314        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  315        database. If a connection is not provided, a new connection to an in-memory database is created.
  316        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  317        sqlite
  318        """
  319
  320        # Connexion db
  321        connexion_db = self.set_connexion_db()
  322
  323        # Connexion config
  324        connexion_config = self.get_connexion_config()
  325
  326        # Connexion format
  327        connexion_format = self.get_config().get("connexion_format", "duckdb")
  328        # Set connexion format
  329        self.connexion_format = connexion_format
  330
  331        # Connexion
  332        if not conn:
  333            if connexion_format in ["duckdb"]:
  334                conn = duckdb.connect(connexion_db, config=connexion_config)
  335                # duckDB settings
  336                duckdb_settings = self.get_duckdb_settings()
  337                if duckdb_settings:
  338                    for setting in duckdb_settings:
  339                        setting_value = duckdb_settings.get(setting)
  340                        if isinstance(setting_value, str):
  341                            setting_value = f"'{setting_value}'"
  342                        conn.execute(f"PRAGMA {setting}={setting_value};")
  343            elif connexion_format in ["sqlite"]:
  344                conn = sqlite3.connect(connexion_db)
  345
  346        # Set connexion
  347        self.conn = conn
  348
  349        # Log
  350        log.debug(f"connexion_format: {connexion_format}")
  351        log.debug(f"connexion_db: {connexion_db}")
  352        log.debug(f"connexion config: {connexion_config}")
  353        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  354
  355    def set_output(self, output: str = None) -> None:
  356        """
  357        The `set_output` function in Python sets the output file based on the input or a specified key
  358        in the config file, extracting the output name, extension, and format.
  359
  360        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  361        the output file. If the config file has an 'output' key, the method sets the output to the value
  362        of that key. If no output is provided, it sets the output to `None`
  363        :type output: str
  364        """
  365
  366        if output and not isinstance(output, str):
  367            self.output = output.name
  368        else:
  369            self.output = output
  370
  371        # Output format
  372        if self.output:
  373            output_name, output_extension = os.path.splitext(self.output)
  374            self.output_name = output_name
  375            self.output_extension = output_extension
  376            self.output_format = self.output_extension.replace(".", "")
  377        else:
  378            self.output_name = None
  379            self.output_extension = None
  380            self.output_format = None
  381
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (`self.header_list`) and as a VCF reader object
        (`self.header_vcf`).

        Header lookup order:
        1. a header file given in the config key "header_file",
        2. the header embedded in a vcf/hdr input file (bgzip-compressed or
           plain text),
        3. an external "<input>.hdr" file next to the input file,
        4. a header reconstructed from the file columns (via `Database`),
        5. the minimal default VCF header as a last resort.

        When there is no input file, both attributes are set to None.

        :raises ValueError: when the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal valid VCF header used as a fallback
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except — any failure (missing file,
                    # parse error, ...) silently falls back to the default
                    # header; consider narrowing the exception type
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
  483
  484    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  485        """
  486        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  487        DataFrame based on the connection format.
  488
  489        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  490        represents the SQL query you want to execute. This query will be used to fetch data from a
  491        database and convert it into a pandas DataFrame
  492        :type query: str
  493        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  494        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  495        function will only fetch up to that number of rows from the database query result. If no limit
  496        is specified,
  497        :type limit: int
  498        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  499        """
  500
  501        # Connexion format
  502        connexion_format = self.get_connexion_format()
  503
  504        # Limit in query
  505        if limit:
  506            pd.set_option("display.max_rows", limit)
  507            if connexion_format in ["duckdb"]:
  508                df = (
  509                    self.conn.execute(query)
  510                    .fetch_record_batch(limit)
  511                    .read_next_batch()
  512                    .to_pandas()
  513                )
  514            elif connexion_format in ["sqlite"]:
  515                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  516
  517        # Full query
  518        else:
  519            if connexion_format in ["duckdb"]:
  520                df = self.conn.execute(query).df()
  521            elif connexion_format in ["sqlite"]:
  522                df = pd.read_sql_query(query, self.conn)
  523
  524        return df
  525
  526    def get_overview(self) -> None:
  527        """
  528        The function prints the input, output, config, and dataframe of the current object
  529        """
  530        table_variants_from = self.get_table_variants(clause="from")
  531        sql_columns = self.get_header_columns_as_sql()
  532        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  533        df = self.get_query_to_df(sql_query_export)
  534        log.info(
  535            "Input:  "
  536            + str(self.get_input())
  537            + " ["
  538            + str(str(self.get_input_format()))
  539            + "]"
  540        )
  541        log.info(
  542            "Output: "
  543            + str(self.get_output())
  544            + " ["
  545            + str(str(self.get_output_format()))
  546            + "]"
  547        )
  548        log.info("Config: ")
  549        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  550            "\n"
  551        ):
  552            log.info("\t" + str(d))
  553        log.info("Param: ")
  554        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  555            "\n"
  556        ):
  557            log.info("\t" + str(d))
  558        log.info("Sample list: " + str(self.get_header_sample_list()))
  559        log.info("Dataframe: ")
  560        for d in str(df).split("\n"):
  561            log.info("\t" + str(d))
  562
  563        # garbage collector
  564        del df
  565        gc.collect()
  566
  567        return None
  568
    def get_stats(self) -> dict:
        """
        Compute statistics of the current object: input file, number of
        variants (total, by chromosome, by type SNV/MNV/InDel and by
        substitution), samples and their genotype counts, INFO/FORMAT header
        fields, and QUAL statistics.

        :return: a dictionary with the keys "Infos", "Variants", "Header",
            plus "Samples" when genotypes are available and "Quality" when a
            QUAL column exists
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table used in the FROM clauses below
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT fields
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage of variants per chromosome
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only when genotypes (GT) and a FORMAT column exist
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; only rows whose genotype matches
                # a diploid-like pattern and whose field count matches FORMAT
                # are considered
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts only if at least one genotype was found
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` indexes rows across both INFO and FORMAT tables
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: translate special VCF Number codes to their symbols
                # (None -> ".", A/G/R per the VCF specification)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        # QUAL values are cast to INTEGER; missing values ('.') are excluded
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the InDel clause below, AND binds tighter than OR,
        # so the filter reads len(REF) > 1 OR (len(ALT) > 1 AND len(REF) !=
        # len(ALT)) — confirm this is the intended classification
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Substitution counts (SNVs only), e.g. "A>G"
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  790
  791    def stats_to_file(self, file: str = None) -> str:
  792        """
  793        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  794        into a JSON object, and writes the JSON object to the specified file.
  795
  796        :param file: The `file` parameter is a string that represents the file path where the JSON data
  797        will be written
  798        :type file: str
  799        :return: the name of the file that was written to.
  800        """
  801
  802        # Get stats
  803        stats = self.get_stats()
  804
  805        # Serializing json
  806        json_object = json.dumps(stats, indent=4)
  807
  808        # Writing to sample.json
  809        with open(file, "w") as outfile:
  810            outfile.write(json_object)
  811
  812        return file
  813
  814    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  815        """
  816        The `print_stats` function generates a markdown file and prints the statistics contained in a
  817        JSON file in a formatted manner.
  818
  819        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  820        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  821        provided, a temporary directory will be created and the stats will be saved in a file named
  822        "stats.md" within that
  823        :type output_file: str
  824        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  825        file where the statistics will be saved. If no value is provided, a temporary directory will be
  826        created and a default file name "stats.json" will be used
  827        :type json_file: str
  828        :return: The function `print_stats` does not return any value. It has a return type annotation
  829        of `None`.
  830        """
  831
  832        # Full path
  833        output_file = full_path(output_file)
  834        json_file = full_path(json_file)
  835
  836        with tempfile.TemporaryDirectory() as tmpdir:
  837
  838            # Files
  839            if not output_file:
  840                output_file = os.path.join(tmpdir, "stats.md")
  841            if not json_file:
  842                json_file = os.path.join(tmpdir, "stats.json")
  843
  844            # Create folders
  845            if not os.path.exists(os.path.dirname(output_file)):
  846                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  847            if not os.path.exists(os.path.dirname(json_file)):
  848                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  849
  850            # Create stats JSON file
  851            stats_file = self.stats_to_file(file=json_file)
  852
  853            # Print stats file
  854            with open(stats_file) as f:
  855                stats = yaml.safe_load(f)
  856
  857            # Output
  858            output_title = []
  859            output_index = []
  860            output = []
  861
  862            # Title
  863            output_title.append("# HOWARD Stats")
  864
  865            # Index
  866            output_index.append("## Index")
  867
  868            # Process sections
  869            for section in stats:
  870                infos = stats.get(section)
  871                section_link = "#" + section.lower().replace(" ", "-")
  872                output.append(f"## {section}")
  873                output_index.append(f"- [{section}]({section_link})")
  874
  875                if len(infos):
  876                    for info in infos:
  877                        try:
  878                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  879                            is_df = True
  880                        except:
  881                            try:
  882                                df = pd.DataFrame.from_dict(
  883                                    json.loads((infos.get(info))), orient="index"
  884                                )
  885                                is_df = True
  886                            except:
  887                                is_df = False
  888                        if is_df:
  889                            output.append(f"### {info}")
  890                            info_link = "#" + info.lower().replace(" ", "-")
  891                            output_index.append(f"   - [{info}]({info_link})")
  892                            output.append(f"{df.to_markdown(index=False)}")
  893                        else:
  894                            output.append(f"- {info}: {infos.get(info)}")
  895                else:
  896                    output.append(f"NA")
  897
  898            # Write stats in markdown file
  899            with open(output_file, "w") as fp:
  900                for item in output_title:
  901                    fp.write("%s\n" % item)
  902                for item in output_index:
  903                    fp.write("%s\n" % item)
  904                for item in output:
  905                    fp.write("%s\n" % item)
  906
  907            # Output stats in markdown
  908            print("")
  909            print("\n\n".join(output_title))
  910            print("")
  911            print("\n\n".join(output))
  912            print("")
  913
  914        return None
  915
  916    def get_input(self) -> str:
  917        """
  918        It returns the value of the input variable.
  919        :return: The input is being returned.
  920        """
  921        return self.input
  922
  923    def get_input_format(self, input_file: str = None) -> str:
  924        """
  925        This function returns the format of the input variable, either from the provided input file or
  926        by prompting for input.
  927
  928        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  929        represents the file path of the input file. If no `input_file` is provided when calling the
  930        method, it will default to `None`
  931        :type input_file: str
  932        :return: The format of the input variable is being returned.
  933        """
  934
  935        if not input_file:
  936            input_file = self.get_input()
  937        input_format = get_file_format(input_file)
  938        return input_format
  939
  940    def get_input_compressed(self, input_file: str = None) -> str:
  941        """
  942        The function `get_input_compressed` returns the format of the input variable after compressing
  943        it.
  944
  945        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  946        that represents the file path of the input file. If no `input_file` is provided when calling the
  947        method, it will default to `None` and the method will then call `self.get_input()` to
  948        :type input_file: str
  949        :return: The function `get_input_compressed` returns the compressed format of the input
  950        variable.
  951        """
  952
  953        if not input_file:
  954            input_file = self.get_input()
  955        input_compressed = get_file_compressed(input_file)
  956        return input_compressed
  957
  958    def get_output(self) -> str:
  959        """
  960        It returns the output of the neuron.
  961        :return: The output of the neural network.
  962        """
  963
  964        return self.output
  965
  966    def get_output_format(self, output_file: str = None) -> str:
  967        """
  968        The function `get_output_format` returns the format of the input variable or the output file if
  969        provided.
  970
  971        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  972        that represents the file path of the output file. If no `output_file` is provided when calling
  973        the method, it will default to the output obtained from the `get_output` method of the class
  974        instance. The
  975        :type output_file: str
  976        :return: The format of the input variable is being returned.
  977        """
  978
  979        if not output_file:
  980            output_file = self.get_output()
  981        output_format = get_file_format(output_file)
  982
  983        return output_format
  984
  985    def get_config(self) -> dict:
  986        """
  987        It returns the config
  988        :return: The config variable is being returned.
  989        """
  990        return self.config
  991
  992    def get_param(self) -> dict:
  993        """
  994        It returns the param
  995        :return: The param variable is being returned.
  996        """
  997        return self.param
  998
  999    def get_connexion_db(self) -> str:
 1000        """
 1001        It returns the connexion_db attribute of the object
 1002        :return: The connexion_db is being returned.
 1003        """
 1004        return self.connexion_db
 1005
 1006    def get_prefix(self) -> str:
 1007        """
 1008        It returns the prefix of the object.
 1009        :return: The prefix is being returned.
 1010        """
 1011        return self.prefix
 1012
 1013    def get_table_variants(self, clause: str = "select") -> str:
 1014        """
 1015        This function returns the table_variants attribute of the object
 1016
 1017        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1018        defaults to select (optional)
 1019        :return: The table_variants attribute of the object.
 1020        """
 1021
 1022        # Access
 1023        access = self.get_config().get("access", None)
 1024
 1025        # Clauses "select", "where", "update"
 1026        if clause in ["select", "where", "update"]:
 1027            table_variants = self.table_variants
 1028        # Clause "from"
 1029        elif clause in ["from"]:
 1030            # For Read Only
 1031            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1032                input_file = self.get_input()
 1033                table_variants = f"'{input_file}' as variants"
 1034            # For Read Write
 1035            else:
 1036                table_variants = f"{self.table_variants} as variants"
 1037        else:
 1038            table_variants = self.table_variants
 1039        return table_variants
 1040
 1041    def get_tmp_dir(self) -> str:
 1042        """
 1043        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1044        parameters or a default path.
 1045        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1046        configuration, parameters, and a default value of "/tmp".
 1047        """
 1048
 1049        return get_tmp(
 1050            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1051        )
 1052
 1053    def get_connexion_type(self) -> str:
 1054        """
 1055        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1056
 1057        :return: The connexion type is being returned.
 1058        """
 1059        return self.get_config().get("connexion_type", "memory")
 1060
 1061    def get_connexion(self):
 1062        """
 1063        It returns the connection object
 1064
 1065        :return: The connection object.
 1066        """
 1067        return self.conn
 1068
 1069    def close_connexion(self) -> None:
 1070        """
 1071        This function closes the connection to the database.
 1072        :return: The connection is being closed.
 1073        """
 1074        return self.conn.close()
 1075
 1076    def get_header(self, type: str = "vcf"):
 1077        """
 1078        This function returns the header of the VCF file as a list of strings
 1079
 1080        :param type: the type of header you want to get, defaults to vcf (optional)
 1081        :return: The header of the vcf file.
 1082        """
 1083
 1084        if self.header_vcf:
 1085            if type == "vcf":
 1086                return self.header_vcf
 1087            elif type == "list":
 1088                return self.header_list
 1089        else:
 1090            if type == "vcf":
 1091                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1092                return header
 1093            elif type == "list":
 1094                return vcf_required
 1095
 1096    def get_header_infos_list(self) -> list:
 1097        """
 1098        This function retrieves a list of information fields from the header.
 1099        :return: A list of information fields from the header.
 1100        """
 1101
 1102        # Init
 1103        infos_list = []
 1104
 1105        for field in self.get_header().infos:
 1106            infos_list.append(field)
 1107
 1108        return infos_list
 1109
 1110    def get_header_length(self, file: str = None) -> int:
 1111        """
 1112        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1113        line.
 1114
 1115        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1116        header file. If this argument is provided, the function will read the header from the specified
 1117        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1118        :type file: str
 1119        :return: the length of the header list, excluding the #CHROM line.
 1120        """
 1121
 1122        if file:
 1123            return len(self.read_vcf_header_file(file=file)) - 1
 1124        elif self.get_header(type="list"):
 1125            return len(self.get_header(type="list")) - 1
 1126        else:
 1127            return 0
 1128
 1129    def get_header_columns(self) -> str:
 1130        """
 1131        This function returns the header list of a VCF
 1132
 1133        :return: The length of the header list.
 1134        """
 1135        if self.get_header():
 1136            return self.get_header(type="list")[-1]
 1137        else:
 1138            return ""
 1139
 1140    def get_header_columns_as_list(self) -> list:
 1141        """
 1142        This function returns the header list of a VCF
 1143
 1144        :return: The length of the header list.
 1145        """
 1146        if self.get_header():
 1147            return self.get_header_columns().strip().split("\t")
 1148        else:
 1149            return []
 1150
 1151    def get_header_columns_as_sql(self) -> str:
 1152        """
 1153        This function retruns header length (without #CHROM line)
 1154
 1155        :return: The length of the header list.
 1156        """
 1157        sql_column_list = []
 1158        for col in self.get_header_columns_as_list():
 1159            sql_column_list.append(f'"{col}"')
 1160        return ",".join(sql_column_list)
 1161
 1162    def get_header_sample_list(
 1163        self, check: bool = False, samples: list = None, samples_force: bool = False
 1164    ) -> list:
 1165        """
 1166        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1167        checking and filtering based on input parameters.
 1168
 1169        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1170        parameter that determines whether to check if the samples in the list are properly defined as
 1171        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1172        list is defined as a, defaults to False
 1173        :type check: bool (optional)
 1174        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1175        allows you to specify a subset of samples from the header. If you provide a list of sample
 1176        names, the function will check if each sample is defined in the header. If a sample is not found
 1177        in the
 1178        :type samples: list
 1179        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1180        a boolean parameter that determines whether to force the function to return the sample list
 1181        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1182        function will return the sample list without performing, defaults to False
 1183        :type samples_force: bool (optional)
 1184        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1185        parameters and conditions specified in the function.
 1186        """
 1187
 1188        # Init
 1189        samples_list = []
 1190
 1191        if samples is None:
 1192            samples_list = self.header_vcf.samples
 1193        else:
 1194            samples_checked = []
 1195            for sample in samples:
 1196                if sample in self.header_vcf.samples:
 1197                    samples_checked.append(sample)
 1198                else:
 1199                    log.warning(f"Sample '{sample}' not defined in header")
 1200            samples_list = samples_checked
 1201
 1202            # Force sample list without checking if is_genotype_column
 1203            if samples_force:
 1204                log.warning(f"Samples {samples_list} not checked if genotypes")
 1205                return samples_list
 1206
 1207        if check:
 1208            samples_checked = []
 1209            for sample in samples_list:
 1210                if self.is_genotype_column(column=sample):
 1211                    samples_checked.append(sample)
 1212                else:
 1213                    log.warning(
 1214                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1215                    )
 1216            samples_list = samples_checked
 1217
 1218        # Return samples list
 1219        return samples_list
 1220
 1221    def is_genotype_column(self, column: str = None) -> bool:
 1222        """
 1223        This function checks if a given column is a genotype column in a database.
 1224
 1225        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1226        represents the column name in a database table. This method checks if the specified column is a
 1227        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1228        method of
 1229        :type column: str
 1230        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1231        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1232        column name and returns the result. If the `column` parameter is None, it returns False.
 1233        """
 1234
 1235        if column is not None:
 1236            return Database(database=self.get_input()).is_genotype_column(column=column)
 1237        else:
 1238            return False
 1239
 1240    def get_verbose(self) -> bool:
 1241        """
 1242        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1243        exist
 1244
 1245        :return: The value of the key "verbose" in the config dictionary.
 1246        """
 1247        return self.get_config().get("verbose", False)
 1248
 1249    def get_connexion_format(self) -> str:
 1250        """
 1251        It returns the connexion format of the object.
 1252        :return: The connexion_format is being returned.
 1253        """
 1254        connexion_format = self.connexion_format
 1255        if connexion_format not in ["duckdb", "sqlite"]:
 1256            log.error(f"Unknown connexion format {connexion_format}")
 1257            raise ValueError(f"Unknown connexion format {connexion_format}")
 1258        else:
 1259            return connexion_format
 1260
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table based on the specified
        database format.

        :param file: The `file` parameter is the file that you want to load into a table. It should be
        the path to the file on your system
        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
        should contain the names of the columns in the table where the data will be inserted. The column
        names should be separated by commas within the string. For example, if you have columns named
        "id", "name
        :type columns: str
        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
        the number of lines to skip at the beginning of the file before reading the actual data. This
        parameter allows you to skip any header information present in the file before processing the
        data, defaults to 0
        :type header_len: int (optional)
        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
        separator character that is used in the file being read. In this case, the default separator is
        set to `\t`, which represents a tab character. You can change this parameter to a different
        separator character if, defaults to \t
        :type sep: str (optional)
        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
        when processing the file in chunks. In the provided code snippet, the default value for
        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
        to 1000000
        :type chunksize: int (optional)
        """

        # Config: a "load.chunk" config value overrides the chunksize argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE: if chunksize is falsy (e.g. configured as 0), nothing is loaded
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # The SQL references the local name `chunk`: DuckDB
                    # resolves it to the in-scope pandas DataFrame via its
                    # replacement-scan mechanism — do not rename this variable
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path: append the DataFrame directly via pandas
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1314
 1315    def load_data(
 1316        self,
 1317        input_file: str = None,
 1318        drop_variants_table: bool = False,
 1319        sample_size: int = 20480,
 1320    ) -> None:
 1321        """
 1322        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1323        table before loading the data and specify a sample size.
 1324
 1325        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1326        table
 1327        :type input_file: str
 1328        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1329        determines whether the variants table should be dropped before loading the data. If set to
 1330        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1331        not be dropped, defaults to False
 1332        :type drop_variants_table: bool (optional)
 1333        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1334        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1335        20480
 1336        :type sample_size: int (optional)
 1337        """
 1338
 1339        log.info("Loading...")
 1340
 1341        # change input file
 1342        if input_file:
 1343            self.set_input(input_file)
 1344            self.set_header()
 1345
 1346        # drop variants table
 1347        if drop_variants_table:
 1348            self.drop_variants_table()
 1349
 1350        # get table variants
 1351        table_variants = self.get_table_variants()
 1352
 1353        # Access
 1354        access = self.get_config().get("access", None)
 1355        log.debug(f"access: {access}")
 1356
 1357        # Input format and compress
 1358        input_format = self.get_input_format()
 1359        input_compressed = self.get_input_compressed()
 1360        log.debug(f"input_format: {input_format}")
 1361        log.debug(f"input_compressed: {input_compressed}")
 1362
 1363        # input_compressed_format
 1364        if input_compressed:
 1365            input_compressed_format = "gzip"
 1366        else:
 1367            input_compressed_format = "none"
 1368        log.debug(f"input_compressed_format: {input_compressed_format}")
 1369
 1370        # Connexion format
 1371        connexion_format = self.get_connexion_format()
 1372
 1373        # Sample size
 1374        if not sample_size:
 1375            sample_size = -1
 1376        log.debug(f"sample_size: {sample_size}")
 1377
 1378        # Load data
 1379        log.debug(f"Load Data from {input_format}")
 1380
 1381        # DuckDB connexion
 1382        if connexion_format in ["duckdb"]:
 1383
 1384            # Database already exists
 1385            if self.input_format in ["db", "duckdb"]:
 1386
 1387                if connexion_format in ["duckdb"]:
 1388                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1389                else:
 1390                    log.error(
 1391                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1392                    )
 1393                    raise ValueError(
 1394                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1395                    )
 1396
 1397            # Load from existing database format
 1398            else:
 1399
 1400                try:
 1401                    # Create Table or View
 1402                    database = Database(database=self.input)
 1403                    sql_from = database.get_sql_from(sample_size=sample_size)
 1404
 1405                    if access in ["RO"]:
 1406                        sql_load = (
 1407                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1408                        )
 1409                    else:
 1410                        sql_load = (
 1411                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1412                        )
 1413                    self.conn.execute(sql_load)
 1414
 1415                except:
 1416                    # Format not available
 1417                    log.error(f"Input file format '{self.input_format}' not available")
 1418                    raise ValueError(
 1419                        f"Input file format '{self.input_format}' not available"
 1420                    )
 1421
 1422        # SQLite connexion
 1423        elif connexion_format in ["sqlite"] and input_format in [
 1424            "vcf",
 1425            "tsv",
 1426            "csv",
 1427            "psv",
 1428        ]:
 1429
 1430            # Main structure
 1431            structure = {
 1432                "#CHROM": "VARCHAR",
 1433                "POS": "INTEGER",
 1434                "ID": "VARCHAR",
 1435                "REF": "VARCHAR",
 1436                "ALT": "VARCHAR",
 1437                "QUAL": "VARCHAR",
 1438                "FILTER": "VARCHAR",
 1439                "INFO": "VARCHAR",
 1440            }
 1441
 1442            # Strcuture with samples
 1443            structure_complete = structure
 1444            if self.get_header_sample_list():
 1445                structure["FORMAT"] = "VARCHAR"
 1446                for sample in self.get_header_sample_list():
 1447                    structure_complete[sample] = "VARCHAR"
 1448
 1449            # Columns list for create and insert
 1450            sql_create_table_columns = []
 1451            sql_create_table_columns_list = []
 1452            for column in structure_complete:
 1453                column_type = structure_complete[column]
 1454                sql_create_table_columns.append(
 1455                    f'"{column}" {column_type} default NULL'
 1456                )
 1457                sql_create_table_columns_list.append(f'"{column}"')
 1458
 1459            # Create database
 1460            log.debug(f"Create Table {table_variants}")
 1461            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1462            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1463            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1464            self.conn.execute(sql_create_table)
 1465
 1466            # chunksize define length of file chunk load file
 1467            chunksize = 100000
 1468
 1469            # delimiter
 1470            delimiter = file_format_delimiters.get(input_format, "\t")
 1471
 1472            # Load the input file
 1473            with open(self.input, "rt") as input_file:
 1474
 1475                # Use the appropriate file handler based on the input format
 1476                if input_compressed:
 1477                    input_file = bgzf.open(self.input, "rt")
 1478                if input_format in ["vcf"]:
 1479                    header_len = self.get_header_length()
 1480                else:
 1481                    header_len = 0
 1482
 1483                # Insert the file contents into a table
 1484                self.insert_file_to_table(
 1485                    input_file,
 1486                    columns=sql_create_table_columns_list_sql,
 1487                    header_len=header_len,
 1488                    sep=delimiter,
 1489                    chunksize=chunksize,
 1490                )
 1491
 1492        else:
 1493            log.error(
 1494                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1495            )
 1496            raise ValueError(
 1497                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1498            )
 1499
 1500        # Explode INFOS fields into table fields
 1501        if self.get_explode_infos():
 1502            self.explode_infos(
 1503                prefix=self.get_explode_infos_prefix(),
 1504                fields=self.get_explode_infos_fields(),
 1505                force=True,
 1506            )
 1507
 1508        # Create index after insertion
 1509        self.create_indexes()
 1510
 1511    def get_explode_infos(self) -> bool:
 1512        """
 1513        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1514        to False if it is not set.
 1515        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1516        value. If the parameter is not present, it will return False.
 1517        """
 1518
 1519        return self.get_param().get("explode", {}).get("explode_infos", False)
 1520
 1521    def get_explode_infos_fields(
 1522        self,
 1523        explode_infos_fields: str = None,
 1524        remove_fields_not_in_header: bool = False,
 1525    ) -> list:
 1526        """
 1527        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1528        the input parameter `explode_infos_fields`.
 1529
 1530        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1531        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1532        comma-separated list of field names to explode
 1533        :type explode_infos_fields: str
 1534        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1535        flag that determines whether to remove fields that are not present in the header. If it is set
 1536        to `True`, any field that is not in the header will be excluded from the list of exploded
 1537        information fields. If it is set to `, defaults to False
 1538        :type remove_fields_not_in_header: bool (optional)
 1539        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1540        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1541        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1542        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1543        splitting the string by commas.
 1544        """
 1545
 1546        # If no fields, get it in param
 1547        if not explode_infos_fields:
 1548            explode_infos_fields = (
 1549                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1550            )
 1551
 1552        # If no fields, defined as all fields in header using keyword
 1553        if not explode_infos_fields:
 1554            explode_infos_fields = "*"
 1555
 1556        # If fields list not empty
 1557        if explode_infos_fields:
 1558
 1559            # Input fields list
 1560            if isinstance(explode_infos_fields, str):
 1561                fields_input = explode_infos_fields.split(",")
 1562            elif isinstance(explode_infos_fields, list):
 1563                fields_input = explode_infos_fields
 1564            else:
 1565                fields_input = []
 1566
 1567            # Fields list without * keyword
 1568            fields_without_all = fields_input.copy()
 1569            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1570                fields_without_all.remove("*")
 1571
 1572            # Fields in header
 1573            fields_in_header = sorted(list(set(self.get_header().infos)))
 1574
 1575            # Construct list of fields
 1576            fields_output = []
 1577            for field in fields_input:
 1578
 1579                # Strip field
 1580                field = field.strip()
 1581
 1582                # format keyword * in regex
 1583                if field.upper() in ["*"]:
 1584                    field = ".*"
 1585
 1586                # Find all fields with pattern
 1587                r = re.compile(field)
 1588                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1589
 1590                # Remove fields input from search
 1591                if field in fields_search:
 1592                    fields_search = [field]
 1593                elif fields_search != [field]:
 1594                    fields_search = sorted(
 1595                        list(set(fields_search).difference(fields_input))
 1596                    )
 1597
 1598                # If field is not in header (avoid not well formatted header)
 1599                if not fields_search and not remove_fields_not_in_header:
 1600                    fields_search = [field]
 1601
 1602                # Add found fields
 1603                for new_field in fields_search:
 1604                    # Add field, if not already exists, and if it is in header (if asked)
 1605                    if (
 1606                        new_field not in fields_output
 1607                        and (
 1608                            not remove_fields_not_in_header
 1609                            or new_field in fields_in_header
 1610                        )
 1611                        and new_field not in [".*"]
 1612                    ):
 1613                        fields_output.append(new_field)
 1614
 1615            return fields_output
 1616
 1617        else:
 1618
 1619            return []
 1620
 1621    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1622        """
 1623        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1624        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1625        not provided.
 1626
 1627        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1628        prefix to be used for exploding or expanding information
 1629        :type explode_infos_prefix: str
 1630        :return: the value of the variable `explode_infos_prefix`.
 1631        """
 1632
 1633        if not explode_infos_prefix:
 1634            explode_infos_prefix = (
 1635                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1636            )
 1637
 1638        return explode_infos_prefix
 1639
 1640    def add_column(
 1641        self,
 1642        table_name,
 1643        column_name,
 1644        column_type,
 1645        default_value=None,
 1646        drop: bool = False,
 1647    ) -> dict:
 1648        """
 1649        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1650        doesn't already exist.
 1651
 1652        :param table_name: The name of the table to which you want to add a column
 1653        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1654        to the table
 1655        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1656        want to add to the table. It should be a string that represents the desired data type, such as
 1657        "INTEGER", "TEXT", "REAL", etc
 1658        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1659        default value for the newly added column. If a default value is provided, it will be assigned to
 1660        the column for any existing rows that do not have a value for that column
 1661        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1662        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1663        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1664        to False
 1665        :type drop: bool (optional)
 1666        :return: a boolean value indicating whether the column was successfully added to the table.
 1667        """
 1668
 1669        # added
 1670        added = False
 1671        dropped = False
 1672
 1673        # Check if the column already exists in the table
 1674        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1675        columns = self.get_query_to_df(query).columns.tolist()
 1676        if column_name.upper() in [c.upper() for c in columns]:
 1677            log.debug(
 1678                f"The {column_name} column already exists in the {table_name} table"
 1679            )
 1680            if drop:
 1681                self.drop_column(table_name=table_name, column_name=column_name)
 1682                dropped = True
 1683            else:
 1684                return None
 1685        else:
 1686            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1687
 1688        # Add column in table
 1689        add_column_query = (
 1690            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1691        )
 1692        if default_value is not None:
 1693            add_column_query += f" DEFAULT {default_value}"
 1694        self.execute_query(add_column_query)
 1695        added = not dropped
 1696        log.debug(
 1697            f"The {column_name} column was successfully added to the {table_name} table"
 1698        )
 1699
 1700        if added:
 1701            added_column = {
 1702                "table_name": table_name,
 1703                "column_name": column_name,
 1704                "column_type": column_type,
 1705                "default_value": default_value,
 1706            }
 1707        else:
 1708            added_column = None
 1709
 1710        return added_column
 1711
 1712    def drop_column(
 1713        self, column: dict = None, table_name: str = None, column_name: str = None
 1714    ) -> bool:
 1715        """
 1716        The `drop_column` function drops a specified column from a given table in a database and returns
 1717        True if the column was successfully dropped, and False if the column does not exist in the
 1718        table.
 1719
 1720        :param column: The `column` parameter is a dictionary that contains information about the column
 1721        you want to drop. It has two keys:
 1722        :type column: dict
 1723        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1724        drop a column
 1725        :type table_name: str
 1726        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1727        from the table
 1728        :type column_name: str
 1729        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1730        and False if the column does not exist in the table.
 1731        """
 1732
 1733        # Find column infos
 1734        if column:
 1735            if isinstance(column, dict):
 1736                table_name = column.get("table_name", None)
 1737                column_name = column.get("column_name", None)
 1738            elif isinstance(column, str):
 1739                table_name = self.get_table_variants()
 1740                column_name = column
 1741            else:
 1742                table_name = None
 1743                column_name = None
 1744
 1745        if not table_name and not column_name:
 1746            return False
 1747
 1748        # Removed
 1749        removed = False
 1750
 1751        # Check if the column already exists in the table
 1752        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1753        columns = self.get_query_to_df(query).columns.tolist()
 1754        if column_name in columns:
 1755            log.debug(f"The {column_name} column exists in the {table_name} table")
 1756        else:
 1757            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1758            return False
 1759
 1760        # Add column in table # ALTER TABLE integers DROP k
 1761        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1762        self.execute_query(add_column_query)
 1763        removed = True
 1764        log.debug(
 1765            f"The {column_name} column was successfully dropped to the {table_name} table"
 1766        )
 1767
 1768        return removed
 1769
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix, falling back to "INFO/"
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, all INFO
        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
        a list to the `
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
        be processed individually. The default value is, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If you provide
        a value for the `table` parameter, the function will use that table name. If the `table`
        parameter is
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # Drop all indexes first: they would slow down (or break) the UPDATEs below
        self.drop_indexes()

        # Connexion format ("duckdb" or "sqlite") selects the SQL dialect for extraction
        connexion_format = self.get_connexion_format()

        # Access mode: no columns are added on read-only ("RO") connexions
        access = self.get_config().get("access", None)

        # List of columns actually added (returned to the caller)
        added_columns = []

        if access not in ["RO"]:

            # Prefix for exploded columns; default to "INFO/" when not
            # configured or not a plain string
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # Target table: explicit parameter wins over the variants table
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # Extra infos; NOTE(review): bare except silently falls back to []
            # on ANY failure of get_extra_infos — consider narrowing
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # INFO field definitions from the VCF header
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Candidate fields: header INFO fields plus any explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Expand patterns/wildcards into concrete field names
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Column name = prefix + INFO field name
                info_id_sql = prefix + info

                # Only process fields known from the header, requested with
                # prefix, or present in extra infos
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/cardinality from the header; unknown fields default
                    # to a single String value
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Map VCF type to SQL type; multi-valued fields are stored
                    # as VARCHAR since they cannot map to a scalar SQL type
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add the column (dropped and re-created when force=True)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    # add_column returns None when the column pre-existed
                    if added_column:
                        added_columns.append(added_column)

                    # With force=True, add_column re-creates the column but
                    # returns None, hence the "or force" here
                    if added_column or force:

                        # Remember the column for index creation later on
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the per-field UPDATE expression extracting
                        # '<info>=<value>' from the raw INFO string.
                        # NOTE(review): for any other connexion format,
                        # update_info_field stays unbound and the append below
                        # raises NameError — confirm formats are pre-validated
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to keep UPDATEs smaller;
                # NOTE(review): bare except falls back to a single full-table
                # pass when the chromosome list cannot be fetched
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Restrict to one chromosome only when there are several
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Either one UPDATE setting all fields at once, or one
                    # UPDATE per field (lower memory pressure)
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # Re-create indexes (including the newly added fields) if requested
        if create_index:
            self.create_indexes()

        return added_columns
 1986
 1987    def create_indexes(self) -> None:
 1988        """
 1989        Create indexes on the table after insertion
 1990        """
 1991
 1992        # Access
 1993        access = self.get_config().get("access", None)
 1994
 1995        # get table variants
 1996        table_variants = self.get_table_variants("FROM")
 1997
 1998        if self.get_indexing() and access not in ["RO"]:
 1999            # Create index
 2000            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2001            self.conn.execute(sql_create_table_index)
 2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2003            self.conn.execute(sql_create_table_index)
 2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2005            self.conn.execute(sql_create_table_index)
 2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2007            self.conn.execute(sql_create_table_index)
 2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2009            self.conn.execute(sql_create_table_index)
 2010            for field in self.index_additionnal_fields:
 2011                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2012                self.conn.execute(sql_create_table_index)
 2013
 2014    def drop_indexes(self) -> None:
 2015        """
 2016        Create indexes on the table after insertion
 2017        """
 2018
 2019        # Access
 2020        access = self.get_config().get("access", None)
 2021
 2022        # get table variants
 2023        table_variants = self.get_table_variants("FROM")
 2024
 2025        # Get database format
 2026        connexion_format = self.get_connexion_format()
 2027
 2028        if access not in ["RO"]:
 2029            if connexion_format in ["duckdb"]:
 2030                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2031            elif connexion_format in ["sqlite"]:
 2032                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2033
 2034            list_indexes = self.conn.execute(sql_list_indexes)
 2035            index_names = [row[0] for row in list_indexes.fetchall()]
 2036            for index in index_names:
 2037                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2038                self.conn.execute(sql_drop_table_index)
 2039
 2040    def read_vcf_header(self, f) -> list:
 2041        """
 2042        It reads the header of a VCF file and returns a list of the header lines
 2043
 2044        :param f: the file object
 2045        :return: The header lines of the VCF file.
 2046        """
 2047
 2048        header_list = []
 2049        for line in f:
 2050            header_list.append(line)
 2051            if line.startswith("#CHROM"):
 2052                break
 2053        return header_list
 2054
 2055    def read_vcf_header_file(self, file: str = None) -> list:
 2056        """
 2057        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2058        uncompressed files.
 2059
 2060        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2061        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2062        default to `None`
 2063        :type file: str
 2064        :return: The function `read_vcf_header_file` returns a list.
 2065        """
 2066
 2067        if self.get_input_compressed(input_file=file):
 2068            with bgzf.open(file, "rt") as f:
 2069                return self.read_vcf_header(f=f)
 2070        else:
 2071            with open(file, "rt") as f:
 2072                return self.read_vcf_header(f=f)
 2073
 2074    def execute_query(self, query: str):
 2075        """
 2076        It takes a query as an argument, executes it, and returns the results
 2077
 2078        :param query: The query to be executed
 2079        :return: The result of the query is being returned.
 2080        """
 2081        if query:
 2082            return self.conn.execute(query)  # .fetchall()
 2083        else:
 2084            return None
 2085
 2086    def export_output(
 2087        self,
 2088        output_file: str | None = None,
 2089        output_header: str | None = None,
 2090        export_header: bool = True,
 2091        query: str | None = None,
 2092        parquet_partitions: list | None = None,
 2093        chunk_size: int | None = None,
 2094        threads: int | None = None,
 2095        sort: bool = False,
 2096        index: bool = False,
 2097        order_by: str | None = None,
 2098    ) -> bool:
 2099        """
 2100        The `export_output` function exports data from a VCF file to a specified output file in various
 2101        formats, including VCF, CSV, TSV, PSV, and Parquet.
 2102
 2103        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2104        output file to be generated by the function. This is where the exported data will be saved
 2105        :type output_file: str
 2106        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2107        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2108        header will be exported to a file with the same name as the `output_file` parameter, but with
 2109        the extension "
 2110        :type output_header: str
 2111        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2112        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2113        True, the header will be exported to a file. If `export_header` is False, the header will not
 2114        be, defaults to True, if output format is not VCF
 2115        :type export_header: bool (optional)
 2116        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 2117        select specific data from the VCF file before exporting it. If provided, only the data that
 2118        matches the query will be exported
 2119        :type query: str
 2120        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2121        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2122        organize data in a hierarchical directory structure based on the values of one or more columns.
 2123        This can improve query performance when working with large datasets
 2124        :type parquet_partitions: list
 2125        :param chunk_size: The `chunk_size` parameter specifies the number of
 2126        records in batch when exporting data in Parquet format. This parameter is used for
 2127        partitioning the Parquet file into multiple files.
 2128        :type chunk_size: int
 2129        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2130        threads to be used during the export process. It determines the level of parallelism and can
 2131        improve the performance of the export operation. If not provided, the function will use the
 2132        default number of threads
 2133        :type threads: int
 2134        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2135        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2136        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2137        False
 2138        :type sort: bool (optional)
 2139        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2140        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2141        no index will be created. The default value is False, defaults to False
 2142        :type index: bool (optional)
 2143        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2144        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2145        :type order_by: str
 2146        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2147        None if it doesn't.
 2148        """
 2149
 2150        # Log
 2151        log.info("Exporting...")
 2152
 2153        # Full path
 2154        output_file = full_path(output_file)
 2155        output_header = full_path(output_header)
 2156
 2157        # Config
 2158        config = self.get_config()
 2159
 2160        # Param
 2161        param = self.get_param()
 2162
 2163        # Tmp files to remove
 2164        tmp_to_remove = []
 2165
 2166        # If no output, get it
 2167        if not output_file:
 2168            output_file = self.get_output()
 2169
 2170        # If not threads
 2171        if not threads:
 2172            threads = self.get_threads()
 2173
 2174        # Auto header name with extension
 2175        if export_header or output_header:
 2176            if not output_header:
 2177                output_header = f"{output_file}.hdr"
 2178            # Export header
 2179            self.export_header(output_file=output_file)
 2180
 2181        # Switch off export header if VCF output
 2182        output_file_type = get_file_format(output_file)
 2183        if output_file_type in ["vcf"]:
 2184            export_header = False
 2185            tmp_to_remove.append(output_header)
 2186
 2187        # Chunk size
 2188        if not chunk_size:
 2189            chunk_size = config.get("chunk_size", None)
 2190
 2191        # Parquet partition
 2192        if not parquet_partitions:
 2193            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2194        if parquet_partitions and isinstance(parquet_partitions, str):
 2195            parquet_partitions = parquet_partitions.split(",")
 2196
 2197        # Order by
 2198        if not order_by:
 2199            order_by = param.get("export", {}).get("order_by", "")
 2200
 2201        # Header in output
 2202        header_in_output = param.get("export", {}).get("include_header", False)
 2203
 2204        # Database
 2205        database_source = self.get_connexion()
 2206
 2207        # Connexion format
 2208        connexion_format = self.get_connexion_format()
 2209
 2210        # Explode infos
 2211        if self.get_explode_infos():
 2212            self.explode_infos(
 2213                prefix=self.get_explode_infos_prefix(),
 2214                fields=self.get_explode_infos_fields(),
 2215                force=False,
 2216            )
 2217
 2218        # if connexion_format in ["sqlite"] or query:
 2219        if connexion_format in ["sqlite"]:
 2220
 2221            # Export in Parquet
 2222            random_tmp = "".join(
 2223                random.choice(string.ascii_lowercase) for i in range(10)
 2224            )
 2225            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2226            tmp_to_remove.append(database_source)
 2227
 2228            # Table Variants
 2229            table_variants = self.get_table_variants()
 2230
 2231            # Create export query
 2232            sql_query_export_subquery = f"""
 2233                SELECT * FROM {table_variants}
 2234                """
 2235
 2236            # Write source file
 2237            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2238
 2239        # Create database
 2240        database = Database(
 2241            database=database_source,
 2242            table="variants",
 2243            header_file=output_header,
 2244            conn_config=self.get_connexion_config(),
 2245        )
 2246
 2247        # Existing colomns header
 2248        existing_columns_header = database.get_header_columns_from_database(query=query)
 2249
 2250        # Sample list
 2251        if output_file_type in ["vcf"]:
 2252            get_samples = self.get_samples()
 2253            get_samples_check = self.get_samples_check()
 2254            samples_force = get_samples is not None
 2255            sample_list = self.get_header_sample_list(
 2256                check=get_samples_check,
 2257                samples=get_samples,
 2258                samples_force=samples_force,
 2259            )
 2260        else:
 2261            sample_list = None
 2262
 2263        # Export file
 2264        database.export(
 2265            output_database=output_file,
 2266            output_header=output_header,
 2267            existing_columns_header=existing_columns_header,
 2268            parquet_partitions=parquet_partitions,
 2269            chunk_size=chunk_size,
 2270            threads=threads,
 2271            sort=sort,
 2272            index=index,
 2273            header_in_output=header_in_output,
 2274            order_by=order_by,
 2275            query=query,
 2276            export_header=export_header,
 2277            sample_list=sample_list,
 2278        )
 2279
 2280        # Remove
 2281        remove_if_exists(tmp_to_remove)
 2282
 2283        return (os.path.exists(output_file) or None) and (
 2284            os.path.exists(output_file) or None
 2285        )
 2286
 2287    def get_extra_infos(self, table: str = None) -> list:
 2288        """
 2289        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2290        in the header.
 2291
 2292        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2293        name of the table from which you want to retrieve the extra columns that are not present in the
 2294        header. If the `table` parameter is not provided when calling the function, it will default to
 2295        using the variants
 2296        :type table: str
 2297        :return: A list of columns that are in the specified table but not in the header of the table.
 2298        """
 2299
 2300        header_columns = []
 2301
 2302        if not table:
 2303            table = self.get_table_variants(clause="from")
 2304            header_columns = self.get_header_columns()
 2305
 2306        # Check all columns in the database
 2307        query = f""" SELECT * FROM {table} LIMIT 1 """
 2308        log.debug(f"query {query}")
 2309        table_columns = self.get_query_to_df(query).columns.tolist()
 2310        extra_columns = []
 2311
 2312        # Construct extra infos (not in header)
 2313        for column in table_columns:
 2314            if column not in header_columns:
 2315                extra_columns.append(column)
 2316
 2317        return extra_columns
 2318
 2319    def get_extra_infos_sql(self, table: str = None) -> str:
 2320        """
 2321        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2322        by double quotes
 2323
 2324        :param table: The name of the table to get the extra infos from. If None, the default table is
 2325        used
 2326        :type table: str
 2327        :return: A string of the extra infos
 2328        """
 2329
 2330        return ", ".join(
 2331            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2332        )
 2333
 2334    def export_header(
 2335        self,
 2336        header_name: str = None,
 2337        output_file: str = None,
 2338        output_file_ext: str = ".hdr",
 2339        clean_header: bool = True,
 2340        remove_chrom_line: bool = False,
 2341    ) -> str:
 2342        """
 2343        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2344        specified options, and writes it to a new file.
 2345
 2346        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2347        this parameter is not specified, the header will be written to the output file
 2348        :type header_name: str
 2349        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2350        specify the name of the output file where the header will be written. If this parameter is not
 2351        provided, the header will be written to a temporary file
 2352        :type output_file: str
 2353        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2354        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2355        if not specified by the user. This extension will be appended to the `output_file` name to
 2356        create the final, defaults to .hdr
 2357        :type output_file_ext: str (optional)
 2358        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2359        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2360        `True`, the function will clean the header by modifying certain lines based on a specific
 2361        pattern. If `clean_header`, defaults to True
 2362        :type clean_header: bool (optional)
 2363        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2364        boolean flag that determines whether the #CHROM line should be removed from the header before
 2365        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2366        defaults to False
 2367        :type remove_chrom_line: bool (optional)
 2368        :return: The function `export_header` returns the name of the temporary header file that is
 2369        created.
 2370        """
 2371
 2372        if not header_name and not output_file:
 2373            output_file = self.get_output()
 2374
 2375        if self.get_header():
 2376
 2377            # Get header object
 2378            header_obj = self.get_header()
 2379
 2380            # Create database
 2381            db_for_header = Database(database=self.get_input())
 2382
 2383            # Get real columns in the file
 2384            db_header_columns = db_for_header.get_columns()
 2385
 2386            with tempfile.TemporaryDirectory() as tmpdir:
 2387
 2388                # Write header file
 2389                header_file_tmp = os.path.join(tmpdir, "header")
 2390                f = open(header_file_tmp, "w")
 2391                vcf.Writer(f, header_obj)
 2392                f.close()
 2393
 2394                # Replace #CHROM line with rel columns
 2395                header_list = db_for_header.read_header_file(
 2396                    header_file=header_file_tmp
 2397                )
 2398                header_list[-1] = "\t".join(db_header_columns)
 2399
 2400                # Remove CHROM line
 2401                if remove_chrom_line:
 2402                    header_list.pop()
 2403
 2404                # Clean header
 2405                if clean_header:
 2406                    header_list_clean = []
 2407                    for head in header_list:
 2408                        # Clean head for malformed header
 2409                        head_clean = head
 2410                        head_clean = re.subn(
 2411                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2412                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2413                            head_clean,
 2414                            2,
 2415                        )[0]
 2416                        # Write header
 2417                        header_list_clean.append(head_clean)
 2418                    header_list = header_list_clean
 2419
 2420            tmp_header_name = output_file + output_file_ext
 2421
 2422            f = open(tmp_header_name, "w")
 2423            for line in header_list:
 2424                f.write(line)
 2425            f.close()
 2426
 2427        return tmp_header_name
 2428
 2429    def export_variant_vcf(
 2430        self,
 2431        vcf_file,
 2432        remove_info: bool = False,
 2433        add_samples: bool = True,
 2434        list_samples: list = [],
 2435        where_clause: str = "",
 2436        index: bool = False,
 2437        threads: int | None = None,
 2438    ) -> bool | None:
 2439        """
 2440        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2441        remove INFO field, add samples, and control compression and indexing.
 2442
 2443        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2444        written to. It is the output file that will contain the filtered VCF data based on the specified
 2445        parameters
 2446        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2447        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2448        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2449        in, defaults to False
 2450        :type remove_info: bool (optional)
 2451        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2452        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2453        If set to False, the samples will be removed. The default value is True, defaults to True
 2454        :type add_samples: bool (optional)
 2455        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2456        in the output VCF file. By default, all samples will be included. If you provide a list of
 2457        samples, only those samples will be included in the output file
 2458        :type list_samples: list
 2459        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2460        determines whether or not to create an index for the output VCF file. If `index` is set to
 2461        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2462        :type index: bool (optional)
 2463        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2464        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2465        will be used during the export process. More threads can potentially speed up the export process
 2466        by utilizing multiple cores of the processor. If
 2467        :type threads: int | None
 2468        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2469        method with various parameters including the output file, query, threads, sort flag, and index
 2470        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2471        specified parameters and configurations provided in the `export_variant_vcf` function.
 2472        """
 2473
 2474        # Config
 2475        config = self.get_config()
 2476
 2477        # Extract VCF
 2478        log.debug("Export VCF...")
 2479
 2480        # Table variants
 2481        table_variants = self.get_table_variants()
 2482
 2483        # Threads
 2484        if not threads:
 2485            threads = self.get_threads()
 2486
 2487        # Info fields
 2488        if remove_info:
 2489            if not isinstance(remove_info, str):
 2490                remove_info = "."
 2491            info_field = f"""'{remove_info}' as INFO"""
 2492        else:
 2493            info_field = "INFO"
 2494
 2495        # Samples fields
 2496        if add_samples:
 2497            if not list_samples:
 2498                list_samples = self.get_header_sample_list()
 2499            if list_samples:
 2500                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2501            else:
 2502                samples_fields = ""
 2503            log.debug(f"samples_fields: {samples_fields}")
 2504        else:
 2505            samples_fields = ""
 2506
 2507        # Where clause
 2508        if where_clause is None:
 2509            where_clause = ""
 2510
 2511        # Variants
 2512        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2513        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2514        log.debug(f"sql_query_select={sql_query_select}")
 2515
 2516        return self.export_output(
 2517            output_file=vcf_file,
 2518            output_header=None,
 2519            export_header=True,
 2520            query=sql_query_select,
 2521            parquet_partitions=None,
 2522            chunk_size=config.get("chunk_size", None),
 2523            threads=threads,
 2524            sort=True,
 2525            index=index,
 2526            order_by=None,
 2527        )
 2528
 2529    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2530        """
 2531        It takes a list of commands and runs them in parallel using the number of threads specified
 2532
 2533        :param commands: A list of commands to run
 2534        :param threads: The number of threads to use, defaults to 1 (optional)
 2535        """
 2536
 2537        run_parallel_commands(commands, threads)
 2538
 2539    def get_threads(self, default: int = 1) -> int:
 2540        """
 2541        This function returns the number of threads to use for a job, with a default value of 1 if not
 2542        specified.
 2543
 2544        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2545        default number of threads to use if no specific value is provided. If no value is provided for
 2546        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2547        used, defaults to 1
 2548        :type default: int (optional)
 2549        :return: the number of threads to use for the current job.
 2550        """
 2551
 2552        # Config
 2553        config = self.get_config()
 2554
 2555        # Param
 2556        param = self.get_param()
 2557
 2558        # Input threads
 2559        input_thread = param.get("threads", config.get("threads", None))
 2560
 2561        # Check threads
 2562        if not input_thread:
 2563            threads = default
 2564        elif int(input_thread) <= 0:
 2565            threads = os.cpu_count()
 2566        else:
 2567            threads = int(input_thread)
 2568        return threads
 2569
 2570    def get_memory(self, default: str = None) -> str:
 2571        """
 2572        This function retrieves the memory value from parameters or configuration with a default value
 2573        if not found.
 2574
 2575        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2576        default value is used as a fallback in case the `memory` parameter is not provided in the
 2577        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2578        the function
 2579        :type default: str
 2580        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2581        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2582        return the default value provided as an argument to the function.
 2583        """
 2584
 2585        # Config
 2586        config = self.get_config()
 2587
 2588        # Param
 2589        param = self.get_param()
 2590
 2591        # Input threads
 2592        input_memory = param.get("memory", config.get("memory", None))
 2593
 2594        # Check threads
 2595        if input_memory:
 2596            memory = input_memory
 2597        else:
 2598            memory = default
 2599
 2600        return memory
 2601
 2602    def update_from_vcf(self, vcf_file: str) -> None:
 2603        """
 2604        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2605
 2606        :param vcf_file: the path to the VCF file
 2607        """
 2608
 2609        connexion_format = self.get_connexion_format()
 2610
 2611        if connexion_format in ["duckdb"]:
 2612            self.update_from_vcf_duckdb(vcf_file)
 2613        elif connexion_format in ["sqlite"]:
 2614            self.update_from_vcf_sqlite(vcf_file)
 2615
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        It takes a VCF file and updates the INFO column of the variants table in the database with the
        INFO column of the VCF file.

        The VCF body is loaded into a pandas DataFrame; the UPDATE query then
        references that DataFrame by name (`vcf_df`), which DuckDB resolves
        via its replacement scan mechanism. New INFO values are appended to
        existing ones with a ';' separator ('' and '.' count as empty).

        :param vcf_file: the path to the VCF file
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: vcf_df is not unused — it is read by name inside the query
        # below (DuckDB DataFrame replacement scan).
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2671
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
        table, then updates the INFO column of the variants table with the INFO column of the temporary
        table.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table for the VCF, cloning the schema of the
        # variants table (WHERE 0 copies structure without rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table (header lines start
        # with '#' and are skipped via comment="#")
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data, appending the new INFO to
        # the existing one with a ';' separator ('' and '.' count as empty)
        # warning: CONCAT as || operator
        # NOTE(review): the table name 'variants' is hard-coded here (the
        # duckdb path uses get_table_variants) — confirm intentional. Also,
        # when a variant has no match in the VCF the subquery yields NULL
        # and `x || NULL` is NULL in SQLite, which would null out INFO —
        # presumably callers always pass a VCF covering all variants; verify.
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
 2729
 2730    def drop_variants_table(self) -> None:
 2731        """
 2732        > This function drops the variants table
 2733        """
 2734
 2735        table_variants = self.get_table_variants()
 2736        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2737        self.conn.execute(sql_table_variants)
 2738
 2739    def set_variant_id(
 2740        self, variant_id_column: str = "variant_id", force: bool = None
 2741    ) -> str:
 2742        """
 2743        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2744        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2745
 2746        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2747        to variant_id
 2748        :type variant_id_column: str (optional)
 2749        :param force: If True, the variant_id column will be created even if it already exists
 2750        :type force: bool
 2751        :return: The name of the column that contains the variant_id
 2752        """
 2753
 2754        # Assembly
 2755        assembly = self.get_param().get(
 2756            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2757        )
 2758
 2759        # INFO/Tag prefix
 2760        prefix = self.get_explode_infos_prefix()
 2761
 2762        # Explode INFO/SVTYPE
 2763        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2764
 2765        # variants table
 2766        table_variants = self.get_table_variants()
 2767
 2768        # variant_id column
 2769        if not variant_id_column:
 2770            variant_id_column = "variant_id"
 2771
 2772        # Creta variant_id column
 2773        if "variant_id" not in self.get_extra_infos() or force:
 2774
 2775            # Create column
 2776            self.add_column(
 2777                table_name=table_variants,
 2778                column_name=variant_id_column,
 2779                column_type="UBIGINT",
 2780                default_value="0",
 2781            )
 2782
 2783            # Update column
 2784            self.conn.execute(
 2785                f"""
 2786                    UPDATE {table_variants}
 2787                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2788                """
 2789            )
 2790
 2791        # Remove added columns
 2792        for added_column in added_columns:
 2793            self.drop_column(column=added_column)
 2794
 2795        # return variant_id column name
 2796        return variant_id_column
 2797
    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Return the name of the variant_id column, delegating to
        `set_variant_id` which creates and populates it when needed.

        :param variant_id_column: The name of the column that contains the variant IDs,
        defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: Forwarded to `set_variant_id`; when True the column is
        recomputed even if it already exists
        :type force: bool
        :return: The variant_id column name.
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2815
 2816    ###
 2817    # Annotation
 2818    ###
 2819
 2820    def scan_databases(
 2821        self,
 2822        database_formats: list = ["parquet"],
 2823        database_releases: list = ["current"],
 2824    ) -> dict:
 2825        """
 2826        The function `scan_databases` scans for available databases based on specified formats and
 2827        releases.
 2828
 2829        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2830        of the databases to be scanned. In this case, the accepted format is "parquet"
 2831        :type database_formats: list ["parquet"]
 2832        :param database_releases: The `database_releases` parameter is a list that specifies the
 2833        releases of the databases to be scanned. In the provided function, the default value for
 2834        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2835        databases that are in the "current"
 2836        :type database_releases: list
 2837        :return: The function `scan_databases` returns a dictionary containing information about
 2838        databases that match the specified formats and releases.
 2839        """
 2840
 2841        # Config
 2842        config = self.get_config()
 2843
 2844        # Param
 2845        param = self.get_param()
 2846
 2847        # Param - Assembly
 2848        assembly = param.get("assembly", config.get("assembly", None))
 2849        if not assembly:
 2850            assembly = DEFAULT_ASSEMBLY
 2851            log.warning(f"Default assembly '{assembly}'")
 2852
 2853        # Scan for availabled databases
 2854        log.info(
 2855            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2856        )
 2857        databases_infos_dict = databases_infos(
 2858            database_folder_releases=database_releases,
 2859            database_formats=database_formats,
 2860            assembly=assembly,
 2861            config=config,
 2862        )
 2863        log.info(
 2864            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2865        )
 2866
 2867        return databases_infos_dict
 2868
 2869    def annotation(self) -> None:
 2870        """
 2871        It annotates the VCF file with the annotations specified in the config file.
 2872        """
 2873
 2874        # Config
 2875        config = self.get_config()
 2876
 2877        # Param
 2878        param = self.get_param()
 2879
 2880        # Param - Assembly
 2881        assembly = param.get("assembly", config.get("assembly", None))
 2882        if not assembly:
 2883            assembly = DEFAULT_ASSEMBLY
 2884            log.warning(f"Default assembly '{assembly}'")
 2885
 2886        # annotations databases folders
 2887        annotations_databases = set(
 2888            config.get("folders", {})
 2889            .get("databases", {})
 2890            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2891            + config.get("folders", {})
 2892            .get("databases", {})
 2893            .get("parquet", ["~/howard/databases/parquet/current"])
 2894            + config.get("folders", {})
 2895            .get("databases", {})
 2896            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2897        )
 2898
 2899        # Get param annotations
 2900        if param.get("annotations", None) and isinstance(
 2901            param.get("annotations", None), str
 2902        ):
 2903            log.debug(param.get("annotations", None))
 2904            param_annotation_list = param.get("annotations").split(",")
 2905        else:
 2906            param_annotation_list = []
 2907
 2908        # Each tools param
 2909        if param.get("annotation_parquet", None) != None:
 2910            log.debug(
 2911                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2912            )
 2913            if isinstance(param.get("annotation_parquet", None), list):
 2914                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2915            else:
 2916                param_annotation_list.append(param.get("annotation_parquet"))
 2917        if param.get("annotation_snpsift", None) != None:
 2918            if isinstance(param.get("annotation_snpsift", None), list):
 2919                param_annotation_list.append(
 2920                    "snpsift:"
 2921                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2922                )
 2923            else:
 2924                param_annotation_list.append(
 2925                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2926                )
 2927        if param.get("annotation_snpeff", None) != None:
 2928            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2929        if param.get("annotation_bcftools", None) != None:
 2930            if isinstance(param.get("annotation_bcftools", None), list):
 2931                param_annotation_list.append(
 2932                    "bcftools:"
 2933                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2934                )
 2935            else:
 2936                param_annotation_list.append(
 2937                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2938                )
 2939        if param.get("annotation_annovar", None) != None:
 2940            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2941        if param.get("annotation_exomiser", None) != None:
 2942            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2943        if param.get("annotation_splice", None) != None:
 2944            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2945
 2946        # Merge param annotations list
 2947        param["annotations"] = ",".join(param_annotation_list)
 2948
 2949        # debug
 2950        log.debug(f"param_annotations={param['annotations']}")
 2951
 2952        if param.get("annotations"):
 2953
 2954            # Log
 2955            # log.info("Annotations - Check annotation parameters")
 2956
 2957            if not "annotation" in param:
 2958                param["annotation"] = {}
 2959
 2960            # List of annotations parameters
 2961            annotations_list_input = {}
 2962            if isinstance(param.get("annotations", None), str):
 2963                annotation_file_list = [
 2964                    value for value in param.get("annotations", "").split(",")
 2965                ]
 2966                for annotation_file in annotation_file_list:
 2967                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
 2968            else:
 2969                annotations_list_input = param.get("annotations", {})
 2970
 2971            log.info(f"Quick Annotations:")
 2972            for annotation_key in list(annotations_list_input.keys()):
 2973                log.info(f"   {annotation_key}")
 2974
 2975            # List of annotations and associated fields
 2976            annotations_list = {}
 2977
 2978            for annotation_file in annotations_list_input:
 2979
 2980                # Explode annotations if ALL
 2981                if (
 2982                    annotation_file.upper() == "ALL"
 2983                    or annotation_file.upper().startswith("ALL:")
 2984                ):
 2985
 2986                    # check ALL parameters (formats, releases)
 2987                    annotation_file_split = annotation_file.split(":")
 2988                    database_formats = "parquet"
 2989                    database_releases = "current"
 2990                    for annotation_file_option in annotation_file_split[1:]:
 2991                        database_all_options_split = annotation_file_option.split("=")
 2992                        if database_all_options_split[0] == "format":
 2993                            database_formats = database_all_options_split[1].split("+")
 2994                        if database_all_options_split[0] == "release":
 2995                            database_releases = database_all_options_split[1].split("+")
 2996
 2997                    # Scan for availabled databases
 2998                    databases_infos_dict = self.scan_databases(
 2999                        database_formats=database_formats,
 3000                        database_releases=database_releases,
 3001                    )
 3002
 3003                    # Add found databases in annotation parameters
 3004                    for database_infos in databases_infos_dict.keys():
 3005                        annotations_list[database_infos] = {"INFO": None}
 3006
 3007                else:
 3008                    annotations_list[annotation_file] = annotations_list_input[
 3009                        annotation_file
 3010                    ]
 3011
 3012            # Check each databases
 3013            if len(annotations_list):
 3014
 3015                log.info(
 3016                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3017                )
 3018
 3019                for annotation_file in annotations_list:
 3020
 3021                    # Init
 3022                    annotations = annotations_list.get(annotation_file, None)
 3023
 3024                    # Annotation snpEff
 3025                    if annotation_file.startswith("snpeff"):
 3026
 3027                        log.debug(f"Quick Annotation snpEff")
 3028
 3029                        if "snpeff" not in param["annotation"]:
 3030                            param["annotation"]["snpeff"] = {}
 3031
 3032                        if "options" not in param["annotation"]["snpeff"]:
 3033                            param["annotation"]["snpeff"]["options"] = ""
 3034
 3035                        # snpEff options in annotations
 3036                        param["annotation"]["snpeff"]["options"] = "".join(
 3037                            annotation_file.split(":")[1:]
 3038                        )
 3039
 3040                    # Annotation Annovar
 3041                    elif annotation_file.startswith("annovar"):
 3042
 3043                        log.debug(f"Quick Annotation Annovar")
 3044
 3045                        if "annovar" not in param["annotation"]:
 3046                            param["annotation"]["annovar"] = {}
 3047
 3048                        if "annotations" not in param["annotation"]["annovar"]:
 3049                            param["annotation"]["annovar"]["annotations"] = {}
 3050
 3051                        # Options
 3052                        annotation_file_split = annotation_file.split(":")
 3053                        for annotation_file_annotation in annotation_file_split[1:]:
 3054                            if annotation_file_annotation:
 3055                                param["annotation"]["annovar"]["annotations"][
 3056                                    annotation_file_annotation
 3057                                ] = annotations
 3058
 3059                    # Annotation Exomiser
 3060                    elif annotation_file.startswith("exomiser"):
 3061
 3062                        log.debug(f"Quick Annotation Exomiser")
 3063
 3064                        param["annotation"]["exomiser"] = params_string_to_dict(
 3065                            annotation_file
 3066                        )
 3067
 3068                    # Annotation Splice
 3069                    elif annotation_file.startswith("splice"):
 3070
 3071                        log.debug(f"Quick Annotation Splice")
 3072
 3073                        param["annotation"]["splice"] = params_string_to_dict(
 3074                            annotation_file
 3075                        )
 3076
 3077                    # Annotation Parquet or BCFTOOLS
 3078                    else:
 3079
 3080                        # Tools detection
 3081                        if annotation_file.startswith("bcftools:"):
 3082                            annotation_tool_initial = "bcftools"
 3083                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3084                        elif annotation_file.startswith("snpsift:"):
 3085                            annotation_tool_initial = "snpsift"
 3086                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3087                        elif annotation_file.startswith("bigwig:"):
 3088                            annotation_tool_initial = "bigwig"
 3089                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3090                        else:
 3091                            annotation_tool_initial = None
 3092
 3093                        # list of files
 3094                        annotation_file_list = annotation_file.replace("+", ":").split(
 3095                            ":"
 3096                        )
 3097
 3098                        for annotation_file in annotation_file_list:
 3099
 3100                            if annotation_file:
 3101
 3102                                # Annotation tool initial
 3103                                annotation_tool = annotation_tool_initial
 3104
 3105                                # Find file
 3106                                annotation_file_found = None
 3107
 3108                                if os.path.exists(annotation_file):
 3109                                    annotation_file_found = annotation_file
 3110                                elif os.path.exists(full_path(annotation_file)):
 3111                                    annotation_file_found = full_path(annotation_file)
 3112                                else:
 3113                                    # Find within assembly folders
 3114                                    for annotations_database in annotations_databases:
 3115                                        found_files = find_all(
 3116                                            annotation_file,
 3117                                            os.path.join(
 3118                                                annotations_database, assembly
 3119                                            ),
 3120                                        )
 3121                                        if len(found_files) > 0:
 3122                                            annotation_file_found = found_files[0]
 3123                                            break
 3124                                    if not annotation_file_found and not assembly:
 3125                                        # Find within folders
 3126                                        for (
 3127                                            annotations_database
 3128                                        ) in annotations_databases:
 3129                                            found_files = find_all(
 3130                                                annotation_file, annotations_database
 3131                                            )
 3132                                            if len(found_files) > 0:
 3133                                                annotation_file_found = found_files[0]
 3134                                                break
 3135                                log.debug(
 3136                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3137                                )
 3138
 3139                                # Full path
 3140                                annotation_file_found = full_path(annotation_file_found)
 3141
 3142                                if annotation_file_found:
 3143
 3144                                    database = Database(database=annotation_file_found)
 3145                                    quick_annotation_format = database.get_format()
 3146                                    quick_annotation_is_compressed = (
 3147                                        database.is_compressed()
 3148                                    )
 3149                                    quick_annotation_is_indexed = os.path.exists(
 3150                                        f"{annotation_file_found}.tbi"
 3151                                    )
 3152                                    bcftools_preference = False
 3153
 3154                                    # Check Annotation Tool
 3155                                    if not annotation_tool:
 3156                                        if (
 3157                                            bcftools_preference
 3158                                            and quick_annotation_format
 3159                                            in ["vcf", "bed"]
 3160                                            and quick_annotation_is_compressed
 3161                                            and quick_annotation_is_indexed
 3162                                        ):
 3163                                            annotation_tool = "bcftools"
 3164                                        elif quick_annotation_format in [
 3165                                            "vcf",
 3166                                            "bed",
 3167                                            "tsv",
 3168                                            "tsv",
 3169                                            "csv",
 3170                                            "json",
 3171                                            "tbl",
 3172                                            "parquet",
 3173                                            "duckdb",
 3174                                        ]:
 3175                                            annotation_tool = "parquet"
 3176                                        elif quick_annotation_format in [
 3177                                            "bw"
 3178                                        ]:
 3179                                            annotation_tool = "bigwig"
 3180                                        else:
 3181                                            log.error(
 3182                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3183                                            )
 3184                                            raise ValueError(
 3185                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3186                                            )
 3187
 3188                                    log.debug(
 3189                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3190                                    )
 3191
 3192                                    # Annotation Tool dispatch
 3193                                    if annotation_tool:
 3194                                        if annotation_tool not in param["annotation"]:
 3195                                            param["annotation"][annotation_tool] = {}
 3196                                        if (
 3197                                            "annotations"
 3198                                            not in param["annotation"][annotation_tool]
 3199                                        ):
 3200                                            param["annotation"][annotation_tool][
 3201                                                "annotations"
 3202                                            ] = {}
 3203                                        param["annotation"][annotation_tool][
 3204                                            "annotations"
 3205                                        ][annotation_file_found] = annotations
 3206
 3207                                else:
 3208                                    log.warning(
 3209                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3210                                    )
 3211
 3212                self.set_param(param)
 3213
 3214        if param.get("annotation", None):
 3215            log.info("Annotations")
 3216            if param.get("annotation", {}).get("parquet", None):
 3217                log.info("Annotations 'parquet'...")
 3218                self.annotation_parquet()
 3219            if param.get("annotation", {}).get("bcftools", None):
 3220                log.info("Annotations 'bcftools'...")
 3221                self.annotation_bcftools()
 3222            if param.get("annotation", {}).get("snpsift", None):
 3223                log.info("Annotations 'snpsift'...")
 3224                self.annotation_snpsift()
 3225            if param.get("annotation", {}).get("bigwig", None):
 3226                log.info("Annotations 'bigwig'...")
 3227                self.annotation_bigwig()
 3228            if param.get("annotation", {}).get("annovar", None):
 3229                log.info("Annotations 'annovar'...")
 3230                self.annotation_annovar()
 3231            if param.get("annotation", {}).get("snpeff", None):
 3232                log.info("Annotations 'snpeff'...")
 3233                self.annotation_snpeff()
 3234            if param.get("annotation", {}).get("exomiser", None) is not None:
 3235                log.info("Annotations 'exomiser'...")
 3236                self.annotation_exomiser()
 3237            if param.get("annotation", {}).get("splice", None) is not None:
 3238                log.info("Annotations 'splice' ...")
 3239                self.annotation_splice()
 3240
 3241        # Explode INFOS fields into table fields
 3242        if self.get_explode_infos():
 3243            self.explode_infos(
 3244                prefix=self.get_explode_infos_prefix(),
 3245                fields=self.get_explode_infos_fields(),
 3246                force=True,
 3247            )
 3248
 3249
 3250    def annotation_bigwig(self, threads: int = None) -> None:
 3251        """
 3252        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3253        
 3254        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3255        number of threads to be used for parallel processing during the annotation process. If the
 3256        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3257        threads to use based on the system configuration
 3258        :type threads: int
 3259        :return: True
 3260        """
 3261
 3262        # DEBUG
 3263        log.debug("Start annotation with bigwig databases")
 3264
 3265        # # Threads
 3266        # if not threads:
 3267        #     threads = self.get_threads()
 3268        # log.debug("Threads: " + str(threads))
 3269
 3270        # Config
 3271        config = self.get_config()
 3272        log.debug("Config: " + str(config))
 3273
 3274        # Config - BCFTools databases folders
 3275        databases_folders = set(
 3276            self.get_config()
 3277            .get("folders", {})
 3278            .get("databases", {})
 3279            .get("annotations", ["."])
 3280            + self.get_config()
 3281            .get("folders", {})
 3282            .get("databases", {})
 3283            .get("bigwig", ["."])
 3284        )
 3285        log.debug("Databases annotations: " + str(databases_folders))
 3286
 3287        # Param
 3288        annotations = (
 3289            self.get_param()
 3290            .get("annotation", {})
 3291            .get("bigwig", {})
 3292            .get("annotations", None)
 3293        )
 3294        log.debug("Annotations: " + str(annotations))
 3295
 3296        # Assembly
 3297        assembly = self.get_param().get(
 3298            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3299        )
 3300
 3301        # Data
 3302        table_variants = self.get_table_variants()
 3303
 3304        # Check if not empty
 3305        log.debug("Check if not empty")
 3306        sql_query_chromosomes = (
 3307            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3308        )
 3309        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3310        if not sql_query_chromosomes_df["count"][0]:
 3311            log.info(f"VCF empty")
 3312            return
 3313
 3314        # VCF header
 3315        vcf_reader = self.get_header()
 3316        log.debug("Initial header: " + str(vcf_reader.infos))
 3317
 3318        # Existing annotations
 3319        for vcf_annotation in self.get_header().infos:
 3320
 3321            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3322            log.debug(
 3323                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3324            )
 3325
 3326        if annotations:
 3327
 3328            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3329
 3330                # Export VCF file
 3331                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3332
 3333                # annotation_bigwig_config
 3334                annotation_bigwig_config_list = []
 3335
 3336                for annotation in annotations:
 3337                    annotation_fields = annotations[annotation]
 3338
 3339                    # Annotation Name
 3340                    annotation_name = os.path.basename(annotation)
 3341
 3342                    if not annotation_fields:
 3343                        annotation_fields = {"INFO": None}
 3344
 3345                    log.debug(f"Annotation '{annotation_name}'")
 3346                    log.debug(
 3347                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3348                    )
 3349
 3350                    # Create Database
 3351                    database = Database(
 3352                        database=annotation,
 3353                        databases_folders=databases_folders,
 3354                        assembly=assembly,
 3355                    )
 3356
 3357                    # Find files
 3358                    db_file = database.get_database()
 3359                    db_file = full_path(db_file)
 3360                    db_hdr_file = database.get_header_file()
 3361                    db_hdr_file = full_path(db_hdr_file)
 3362                    db_file_type = database.get_format()
 3363
 3364                    # If db_file is http ?
 3365                    if database.get_database().startswith("http"):
 3366
 3367                        # Datbase is HTTP URL
 3368                        db_file_is_http = True
 3369
 3370                        # DB file keep as URL
 3371                        db_file = database.get_database()
 3372                        log.warning(f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)")
 3373
 3374                        # Retrieve automatic annotation field name
 3375                        annotation_field = clean_annotation_field(os.path.basename(db_file).replace(".bw", ""))
 3376                        log.debug(f"Create header file with annotation field '{annotation_field}' is an HTTP URL")
 3377
 3378                        # Create automatic header file
 3379                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3380                        with open(db_hdr_file, 'w') as f:
 3381                            f.write("##fileformat=VCFv4.2\n")
 3382                            f.write(f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""")
 3383                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3384
 3385                    else:
 3386
 3387                        # Datbase is NOT HTTP URL
 3388                        db_file_is_http = False
 3389                    
 3390
 3391                    # Check index - try to create if not exists
 3392                    if db_file is None or db_hdr_file is None or (not os.path.exists(db_file) and not db_file_is_http) or not os.path.exists(db_hdr_file) or not db_file_type in ["bw"]:
 3393                    #if False:
 3394                        log.error("Annotation failed: database not valid")
 3395                        log.error(f"Annotation annotation file: {db_file}")
 3396                        log.error(f"Annotation annotation file type: {db_file_type}")
 3397                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3398                        raise ValueError(
 3399                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3400                        )
 3401                    else:
 3402
 3403                        # Log
 3404                        log.debug(
 3405                            f"Annotation '{annotation}' - file: "
 3406                            + str(db_file)
 3407                            + " and "
 3408                            + str(db_hdr_file)
 3409                        )
 3410
 3411                        # Load header as VCF object
 3412                        db_hdr_vcf = Variants(input=db_hdr_file)
 3413                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3414                        log.debug(
 3415                            "Annotation database header: "
 3416                            + str(db_hdr_vcf_header_infos)
 3417                        )
 3418
 3419                        # For all fields in database
 3420                        annotation_fields_full = False
 3421                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3422                            annotation_fields = {
 3423                                key: key for key in db_hdr_vcf_header_infos
 3424                            }
 3425                            log.debug(
 3426                                "Annotation database header - All annotations added: "
 3427                                + str(annotation_fields)
 3428                            )
 3429                            annotation_fields_full = True
 3430
 3431                        # Init
 3432                        cyvcf2_header_rename_dict = {}
 3433                        cyvcf2_header_list = []
 3434                        cyvcf2_header_indexes = {}
 3435
 3436                        # process annotation fields
 3437                        for annotation_field in annotation_fields:
 3438
 3439                            # New annotation name 
 3440                            annotation_field_new = annotation_fields[annotation_field]
 3441
 3442                            # Check annotation field and index in header
 3443                            if annotation_field in db_hdr_vcf.get_header_columns_as_list():
 3444                                annotation_field_index = db_hdr_vcf.get_header_columns_as_list().index(annotation_field)-3
 3445                                cyvcf2_header_indexes[annotation_field_new] = annotation_field_index
 3446                            else:
 3447                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3448                                log.error(msg_err)
 3449                                raise ValueError(msg_err)
 3450
 3451                            # Append annotation field in cyvcf2 header list
 3452                            cyvcf2_header_rename_dict[annotation_field_new] = db_hdr_vcf_header_infos[annotation_field].id
 3453                            cyvcf2_header_list.append(
 3454                                {
 3455                                    "ID": annotation_field_new,
 3456                                    "Number": db_hdr_vcf_header_infos[annotation_field].num,
 3457                                    "Type": db_hdr_vcf_header_infos[annotation_field].type,
 3458                                    "Description": db_hdr_vcf_header_infos[annotation_field].desc,
 3459                                }
 3460                            )
 3461
 3462                        # Load bigwig database
 3463                        bw_db = pyBigWig.open(db_file)
 3464                        if bw_db.isBigWig():
 3465                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3466                        else:
 3467                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3468                            log.error(msg_err)
 3469                            raise ValueError(msg_err)
 3470
 3471                        annotation_bigwig_config_list.append(
 3472                            {
 3473                                "db_file": db_file,
 3474                                "bw_db": bw_db,
 3475                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3476                                "cyvcf2_header_list": cyvcf2_header_list,
 3477                                "cyvcf2_header_indexes": cyvcf2_header_indexes
 3478                            }
 3479                        )
 3480
 3481                # Annotate
 3482                if annotation_bigwig_config_list:
 3483
 3484                    # Annotation config
 3485                    log.debug(f"annotation_bigwig_config={annotation_bigwig_config_list}")
 3486
 3487                    # Export VCF file
 3488                    self.export_variant_vcf(
 3489                        vcf_file=tmp_vcf_name,
 3490                        remove_info=True,
 3491                        add_samples=False,
 3492                        index=True,
 3493                    )
 3494
 3495                    # Load input tmp file
 3496                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3497
 3498                    # Add header in input file
 3499                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3500                        for cyvcf2_header_field in annotation_bigwig_config.get("cyvcf2_header_list",[]):
 3501                            log.info(f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'")
 3502                            input_vcf.add_info_to_header(
 3503                                cyvcf2_header_field
 3504                            )
 3505
 3506                    # Create output VCF file
 3507                    output_vcf_file = os.path.join(tmp_dir,"output.vcf.gz")
 3508                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3509
 3510                    # Fetch variants
 3511                    log.info(f"Annotations 'bigwig' start...")
 3512                    for variant in input_vcf:
 3513
 3514                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3515
 3516                            # DB and indexes
 3517                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3518                            cyvcf2_header_indexes = annotation_bigwig_config.get("cyvcf2_header_indexes", None)
 3519
 3520                            # Retrieve value from chrom pos
 3521                            res = bw_db.values(variant.CHROM, variant.POS - 1, variant.POS)
 3522                            
 3523                            # For each annotation fields (and indexes)
 3524                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3525
 3526                                # If value is NOT nNone
 3527                                if not np.isnan(res[cyvcf2_header_indexes[cyvcf2_header_index]]):
 3528                                    variant.INFO[cyvcf2_header_index] = res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3529
 3530                        # Add record in output file
 3531                        output_vcf.write_record(variant)
 3532
 3533                    # Log
 3534                    log.debug(f"Annotation done.")
 3535
 3536                    # Close and write file
 3537                    log.info(f"Annotations 'bigwig' write...")
 3538                    output_vcf.close()
 3539                    log.debug(f"Write done.")
 3540
 3541                    # Update variants
 3542                    log.info(f"Annotations 'bigwig' update...")
 3543                    self.update_from_vcf(output_vcf_file)
 3544                    log.debug(f"Update done.")
 3545
 3546        return True
 3547
 3548
 3549    def annotation_snpsift(self, threads: int = None) -> None:
 3550        """
 3551        This function annotate with bcftools
 3552
 3553        :param threads: Number of threads to use
 3554        :return: the value of the variable "return_value".
 3555        """
 3556
 3557        # DEBUG
 3558        log.debug("Start annotation with bcftools databases")
 3559
 3560        # Threads
 3561        if not threads:
 3562            threads = self.get_threads()
 3563        log.debug("Threads: " + str(threads))
 3564
 3565        # Config
 3566        config = self.get_config()
 3567        log.debug("Config: " + str(config))
 3568
 3569        # Config - snpSift
 3570        snpsift_bin_command = get_bin_command(
 3571            bin="SnpSift.jar",
 3572            tool="snpsift",
 3573            bin_type="jar",
 3574            config=config,
 3575            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3576        )
 3577        if not snpsift_bin_command:
 3578            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3579            log.error(msg_err)
 3580            raise ValueError(msg_err)
 3581
 3582        # Config - bcftools
 3583        bcftools_bin_command = get_bin_command(
 3584            bin="bcftools",
 3585            tool="bcftools",
 3586            bin_type="bin",
 3587            config=config,
 3588            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3589        )
 3590        if not bcftools_bin_command:
 3591            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3592            log.error(msg_err)
 3593            raise ValueError(msg_err)
 3594
 3595        # Config - BCFTools databases folders
 3596        databases_folders = set(
 3597            self.get_config()
 3598            .get("folders", {})
 3599            .get("databases", {})
 3600            .get("annotations", ["."])
 3601            + self.get_config()
 3602            .get("folders", {})
 3603            .get("databases", {})
 3604            .get("bcftools", ["."])
 3605        )
 3606        log.debug("Databases annotations: " + str(databases_folders))
 3607
 3608        # Param
 3609        annotations = (
 3610            self.get_param()
 3611            .get("annotation", {})
 3612            .get("snpsift", {})
 3613            .get("annotations", None)
 3614        )
 3615        log.debug("Annotations: " + str(annotations))
 3616
 3617        # Assembly
 3618        assembly = self.get_param().get(
 3619            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3620        )
 3621
 3622        # Data
 3623        table_variants = self.get_table_variants()
 3624
 3625        # Check if not empty
 3626        log.debug("Check if not empty")
 3627        sql_query_chromosomes = (
 3628            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3629        )
 3630        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3631        if not sql_query_chromosomes_df["count"][0]:
 3632            log.info(f"VCF empty")
 3633            return
 3634
 3635        # VCF header
 3636        vcf_reader = self.get_header()
 3637        log.debug("Initial header: " + str(vcf_reader.infos))
 3638
 3639        # Existing annotations
 3640        for vcf_annotation in self.get_header().infos:
 3641
 3642            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3643            log.debug(
 3644                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3645            )
 3646
 3647        if annotations:
 3648
 3649            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3650
 3651                # Export VCF file
 3652                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3653
 3654                # Init
 3655                commands = {}
 3656
 3657                for annotation in annotations:
 3658                    annotation_fields = annotations[annotation]
 3659
 3660                    # Annotation Name
 3661                    annotation_name = os.path.basename(annotation)
 3662
 3663                    if not annotation_fields:
 3664                        annotation_fields = {"INFO": None}
 3665
 3666                    log.debug(f"Annotation '{annotation_name}'")
 3667                    log.debug(
 3668                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3669                    )
 3670
 3671                    # Create Database
 3672                    database = Database(
 3673                        database=annotation,
 3674                        databases_folders=databases_folders,
 3675                        assembly=assembly,
 3676                    )
 3677
 3678                    # Find files
 3679                    db_file = database.get_database()
 3680                    db_file = full_path(db_file)
 3681                    db_hdr_file = database.get_header_file()
 3682                    db_hdr_file = full_path(db_hdr_file)
 3683                    db_file_type = database.get_format()
 3684                    db_tbi_file = f"{db_file}.tbi"
 3685                    db_file_compressed = database.is_compressed()
 3686
 3687                    # Check if compressed
 3688                    if not db_file_compressed:
 3689                        log.error(
 3690                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3691                        )
 3692                        raise ValueError(
 3693                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3694                        )
 3695
 3696                    # Check if indexed
 3697                    if not os.path.exists(db_tbi_file):
 3698                        log.error(
 3699                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3700                        )
 3701                        raise ValueError(
 3702                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3703                        )
 3704
 3705                    # Check index - try to create if not exists
 3706                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3707                        log.error("Annotation failed: database not valid")
 3708                        log.error(f"Annotation annotation file: {db_file}")
 3709                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3710                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3711                        raise ValueError(
 3712                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3713                        )
 3714                    else:
 3715
 3716                        log.debug(
 3717                            f"Annotation '{annotation}' - file: "
 3718                            + str(db_file)
 3719                            + " and "
 3720                            + str(db_hdr_file)
 3721                        )
 3722
 3723                        # Load header as VCF object
 3724                        db_hdr_vcf = Variants(input=db_hdr_file)
 3725                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3726                        log.debug(
 3727                            "Annotation database header: "
 3728                            + str(db_hdr_vcf_header_infos)
 3729                        )
 3730
 3731                        # For all fields in database
 3732                        annotation_fields_full = False
 3733                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3734                            annotation_fields = {
 3735                                key: key for key in db_hdr_vcf_header_infos
 3736                            }
 3737                            log.debug(
 3738                                "Annotation database header - All annotations added: "
 3739                                + str(annotation_fields)
 3740                            )
 3741                            annotation_fields_full = True
 3742
 3743                        # # Create file for field rename
 3744                        # log.debug("Create file for field rename")
 3745                        # tmp_rename = NamedTemporaryFile(
 3746                        #     prefix=self.get_prefix(),
 3747                        #     dir=self.get_tmp_dir(),
 3748                        #     suffix=".rename",
 3749                        #     delete=False,
 3750                        # )
 3751                        # tmp_rename_name = tmp_rename.name
 3752                        # tmp_files.append(tmp_rename_name)
 3753
 3754                        # Number of fields
 3755                        nb_annotation_field = 0
 3756                        annotation_list = []
 3757                        annotation_infos_rename_list = []
 3758
 3759                        for annotation_field in annotation_fields:
 3760
 3761                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3762                            annotation_fields_new_name = annotation_fields.get(
 3763                                annotation_field, annotation_field
 3764                            )
 3765                            if not annotation_fields_new_name:
 3766                                annotation_fields_new_name = annotation_field
 3767
 3768                            # Check if field is in DB and if field is not elready in input data
 3769                            if (
 3770                                annotation_field in db_hdr_vcf.get_header().infos
 3771                                and annotation_fields_new_name
 3772                                not in self.get_header().infos
 3773                            ):
 3774
 3775                                log.info(
 3776                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3777                                )
 3778
 3779                                # BCFTools annotate param to rename fields
 3780                                if annotation_field != annotation_fields_new_name:
 3781                                    annotation_infos_rename_list.append(
 3782                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3783                                    )
 3784
 3785                                # Add INFO field to header
 3786                                db_hdr_vcf_header_infos_number = (
 3787                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3788                                )
 3789                                db_hdr_vcf_header_infos_type = (
 3790                                    db_hdr_vcf_header_infos[annotation_field].type
 3791                                    or "String"
 3792                                )
 3793                                db_hdr_vcf_header_infos_description = (
 3794                                    db_hdr_vcf_header_infos[annotation_field].desc
 3795                                    or f"{annotation_field} description"
 3796                                )
 3797                                db_hdr_vcf_header_infos_source = (
 3798                                    db_hdr_vcf_header_infos[annotation_field].source
 3799                                    or "unknown"
 3800                                )
 3801                                db_hdr_vcf_header_infos_version = (
 3802                                    db_hdr_vcf_header_infos[annotation_field].version
 3803                                    or "unknown"
 3804                                )
 3805
 3806                                vcf_reader.infos[annotation_fields_new_name] = (
 3807                                    vcf.parser._Info(
 3808                                        annotation_fields_new_name,
 3809                                        db_hdr_vcf_header_infos_number,
 3810                                        db_hdr_vcf_header_infos_type,
 3811                                        db_hdr_vcf_header_infos_description,
 3812                                        db_hdr_vcf_header_infos_source,
 3813                                        db_hdr_vcf_header_infos_version,
 3814                                        self.code_type_map[
 3815                                            db_hdr_vcf_header_infos_type
 3816                                        ],
 3817                                    )
 3818                                )
 3819
 3820                                annotation_list.append(annotation_field)
 3821
 3822                                nb_annotation_field += 1
 3823
 3824                            else:
 3825
 3826                                if (
 3827                                    annotation_field
 3828                                    not in db_hdr_vcf.get_header().infos
 3829                                ):
 3830                                    log.warning(
 3831                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3832                                    )
 3833                                if (
 3834                                    annotation_fields_new_name
 3835                                    in self.get_header().infos
 3836                                ):
 3837                                    log.warning(
 3838                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3839                                    )
 3840
 3841                        log.info(
 3842                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3843                        )
 3844
 3845                        annotation_infos = ",".join(annotation_list)
 3846
 3847                        if annotation_infos != "":
 3848
 3849                            # Annotated VCF (and error file)
 3850                            tmp_annotation_vcf_name = os.path.join(
 3851                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3852                            )
 3853                            tmp_annotation_vcf_name_err = (
 3854                                tmp_annotation_vcf_name + ".err"
 3855                            )
 3856
 3857                            # Add fields to annotate
 3858                            if not annotation_fields_full:
 3859                                annotation_infos_option = f"-info {annotation_infos}"
 3860                            else:
 3861                                annotation_infos_option = ""
 3862
 3863                            # Info fields rename
 3864                            if annotation_infos_rename_list:
 3865                                annotation_infos_rename = " -c " + ",".join(
 3866                                    annotation_infos_rename_list
 3867                                )
 3868                            else:
 3869                                annotation_infos_rename = ""
 3870
 3871                            # Annotate command
 3872                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3873
 3874                            # Add command
 3875                            commands[command_annotate] = tmp_annotation_vcf_name
 3876
 3877                if commands:
 3878
 3879                    # Export VCF file
 3880                    self.export_variant_vcf(
 3881                        vcf_file=tmp_vcf_name,
 3882                        remove_info=True,
 3883                        add_samples=False,
 3884                        index=True,
 3885                    )
 3886                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3887
 3888                    # Num command
 3889                    nb_command = 0
 3890
 3891                    # Annotate
 3892                    for command_annotate in commands:
 3893                        nb_command += 1
 3894                        log.info(
 3895                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3896                        )
 3897                        log.debug(f"command_annotate={command_annotate}")
 3898                        run_parallel_commands([command_annotate], threads)
 3899
 3900                        # Debug
 3901                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3902
 3903                        # Update variants
 3904                        log.info(
 3905                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3906                        )
 3907                        self.update_from_vcf(commands[command_annotate])
 3908
 3909
 3910    def annotation_bcftools(self, threads: int = None) -> None:
 3911        """
 3912        This function annotate with bcftools
 3913
 3914        :param threads: Number of threads to use
 3915        :return: the value of the variable "return_value".
 3916        """
 3917
 3918        # DEBUG
 3919        log.debug("Start annotation with bcftools databases")
 3920
 3921        # Threads
 3922        if not threads:
 3923            threads = self.get_threads()
 3924        log.debug("Threads: " + str(threads))
 3925
 3926        # Config
 3927        config = self.get_config()
 3928        log.debug("Config: " + str(config))
 3929
 3930        # DEBUG
 3931        delete_tmp = True
 3932        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3933            delete_tmp = False
 3934            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3935
 3936        # Config - BCFTools bin command
 3937        bcftools_bin_command = get_bin_command(
 3938            bin="bcftools",
 3939            tool="bcftools",
 3940            bin_type="bin",
 3941            config=config,
 3942            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3943        )
 3944        if not bcftools_bin_command:
 3945            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3946            log.error(msg_err)
 3947            raise ValueError(msg_err)
 3948
 3949        # Config - BCFTools databases folders
 3950        databases_folders = set(
 3951            self.get_config()
 3952            .get("folders", {})
 3953            .get("databases", {})
 3954            .get("annotations", ["."])
 3955            + self.get_config()
 3956            .get("folders", {})
 3957            .get("databases", {})
 3958            .get("bcftools", ["."])
 3959        )
 3960        log.debug("Databases annotations: " + str(databases_folders))
 3961
 3962        # Param
 3963        annotations = (
 3964            self.get_param()
 3965            .get("annotation", {})
 3966            .get("bcftools", {})
 3967            .get("annotations", None)
 3968        )
 3969        log.debug("Annotations: " + str(annotations))
 3970
 3971        # Assembly
 3972        assembly = self.get_param().get(
 3973            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3974        )
 3975
 3976        # Data
 3977        table_variants = self.get_table_variants()
 3978
 3979        # Check if not empty
 3980        log.debug("Check if not empty")
 3981        sql_query_chromosomes = (
 3982            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3983        )
 3984        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3985        if not sql_query_chromosomes_df["count"][0]:
 3986            log.info(f"VCF empty")
 3987            return
 3988
 3989        # Export in VCF
 3990        log.debug("Create initial file to annotate")
 3991        tmp_vcf = NamedTemporaryFile(
 3992            prefix=self.get_prefix(),
 3993            dir=self.get_tmp_dir(),
 3994            suffix=".vcf.gz",
 3995            delete=False,
 3996        )
 3997        tmp_vcf_name = tmp_vcf.name
 3998
 3999        # VCF header
 4000        vcf_reader = self.get_header()
 4001        log.debug("Initial header: " + str(vcf_reader.infos))
 4002
 4003        # Existing annotations
 4004        for vcf_annotation in self.get_header().infos:
 4005
 4006            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4007            log.debug(
 4008                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4009            )
 4010
 4011        if annotations:
 4012
 4013            tmp_ann_vcf_list = []
 4014            commands = []
 4015            tmp_files = []
 4016            err_files = []
 4017
 4018            for annotation in annotations:
 4019                annotation_fields = annotations[annotation]
 4020
 4021                # Annotation Name
 4022                annotation_name = os.path.basename(annotation)
 4023
 4024                if not annotation_fields:
 4025                    annotation_fields = {"INFO": None}
 4026
 4027                log.debug(f"Annotation '{annotation_name}'")
 4028                log.debug(
 4029                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4030                )
 4031
 4032                # Create Database
 4033                database = Database(
 4034                    database=annotation,
 4035                    databases_folders=databases_folders,
 4036                    assembly=assembly,
 4037                )
 4038
 4039                # Find files
 4040                db_file = database.get_database()
 4041                db_file = full_path(db_file)
 4042                db_hdr_file = database.get_header_file()
 4043                db_hdr_file = full_path(db_hdr_file)
 4044                db_file_type = database.get_format()
 4045                db_tbi_file = f"{db_file}.tbi"
 4046                db_file_compressed = database.is_compressed()
 4047
 4048                # Check if compressed
 4049                if not db_file_compressed:
 4050                    log.error(
 4051                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4052                    )
 4053                    raise ValueError(
 4054                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4055                    )
 4056
 4057                # Check if indexed
 4058                if not os.path.exists(db_tbi_file):
 4059                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4060                    raise ValueError(
 4061                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4062                    )
 4063
 4064                # Check index - try to create if not exists
 4065                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4066                    log.error("Annotation failed: database not valid")
 4067                    log.error(f"Annotation annotation file: {db_file}")
 4068                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4069                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4070                    raise ValueError(
 4071                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4072                    )
 4073                else:
 4074
 4075                    log.debug(
 4076                        f"Annotation '{annotation}' - file: "
 4077                        + str(db_file)
 4078                        + " and "
 4079                        + str(db_hdr_file)
 4080                    )
 4081
 4082                    # Load header as VCF object
 4083                    db_hdr_vcf = Variants(input=db_hdr_file)
 4084                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4085                    log.debug(
 4086                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4087                    )
 4088
 4089                    # For all fields in database
 4090                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4091                        annotation_fields = {
 4092                            key: key for key in db_hdr_vcf_header_infos
 4093                        }
 4094                        log.debug(
 4095                            "Annotation database header - All annotations added: "
 4096                            + str(annotation_fields)
 4097                        )
 4098
 4099                    # Number of fields
 4100                    nb_annotation_field = 0
 4101                    annotation_list = []
 4102
 4103                    for annotation_field in annotation_fields:
 4104
 4105                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4106                        annotation_fields_new_name = annotation_fields.get(
 4107                            annotation_field, annotation_field
 4108                        )
 4109                        if not annotation_fields_new_name:
 4110                            annotation_fields_new_name = annotation_field
 4111
 4112                        # Check if field is in DB and if field is not elready in input data
 4113                        if (
 4114                            annotation_field in db_hdr_vcf.get_header().infos
 4115                            and annotation_fields_new_name
 4116                            not in self.get_header().infos
 4117                        ):
 4118
 4119                            log.info(
 4120                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4121                            )
 4122
 4123                            # Add INFO field to header
 4124                            db_hdr_vcf_header_infos_number = (
 4125                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4126                            )
 4127                            db_hdr_vcf_header_infos_type = (
 4128                                db_hdr_vcf_header_infos[annotation_field].type
 4129                                or "String"
 4130                            )
 4131                            db_hdr_vcf_header_infos_description = (
 4132                                db_hdr_vcf_header_infos[annotation_field].desc
 4133                                or f"{annotation_field} description"
 4134                            )
 4135                            db_hdr_vcf_header_infos_source = (
 4136                                db_hdr_vcf_header_infos[annotation_field].source
 4137                                or "unknown"
 4138                            )
 4139                            db_hdr_vcf_header_infos_version = (
 4140                                db_hdr_vcf_header_infos[annotation_field].version
 4141                                or "unknown"
 4142                            )
 4143
 4144                            vcf_reader.infos[annotation_fields_new_name] = (
 4145                                vcf.parser._Info(
 4146                                    annotation_fields_new_name,
 4147                                    db_hdr_vcf_header_infos_number,
 4148                                    db_hdr_vcf_header_infos_type,
 4149                                    db_hdr_vcf_header_infos_description,
 4150                                    db_hdr_vcf_header_infos_source,
 4151                                    db_hdr_vcf_header_infos_version,
 4152                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4153                                )
 4154                            )
 4155
 4156                            # annotation_list.append(annotation_field)
 4157                            if annotation_field != annotation_fields_new_name:
 4158                                annotation_list.append(
 4159                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4160                                )
 4161                            else:
 4162                                annotation_list.append(annotation_field)
 4163
 4164                            nb_annotation_field += 1
 4165
 4166                        else:
 4167
 4168                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4169                                log.warning(
 4170                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4171                                )
 4172                            if annotation_fields_new_name in self.get_header().infos:
 4173                                log.warning(
 4174                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4175                                )
 4176
 4177                    log.info(
 4178                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4179                    )
 4180
 4181                    annotation_infos = ",".join(annotation_list)
 4182
 4183                    if annotation_infos != "":
 4184
 4185                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4186                        log.debug("Protect Header file - remove #CHROM line if exists")
 4187                        tmp_header_vcf = NamedTemporaryFile(
 4188                            prefix=self.get_prefix(),
 4189                            dir=self.get_tmp_dir(),
 4190                            suffix=".hdr",
 4191                            delete=False,
 4192                        )
 4193                        tmp_header_vcf_name = tmp_header_vcf.name
 4194                        tmp_files.append(tmp_header_vcf_name)
 4195                        # Command
 4196                        if db_hdr_file.endswith(".gz"):
 4197                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4198                        else:
 4199                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4200                        # Run
 4201                        run_parallel_commands([command_extract_header], 1)
 4202
                        # Find chromosomes
 4204                        log.debug("Find chromosomes ")
 4205                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4206                        sql_query_chromosomes_df = self.get_query_to_df(
 4207                            sql_query_chromosomes
 4208                        )
 4209                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4210
 4211                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4212
 4213                        # BED columns in the annotation file
 4214                        if db_file_type in ["bed"]:
 4215                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4216
 4217                        for chrom in chomosomes_list:
 4218
 4219                            # Create BED on initial VCF
 4220                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4221                            tmp_bed = NamedTemporaryFile(
 4222                                prefix=self.get_prefix(),
 4223                                dir=self.get_tmp_dir(),
 4224                                suffix=".bed",
 4225                                delete=False,
 4226                            )
 4227                            tmp_bed_name = tmp_bed.name
 4228                            tmp_files.append(tmp_bed_name)
 4229
                            # Detect regions
 4231                            log.debug(
 4232                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4233                            )
 4234                            window = 1000000
 4235                            sql_query_intervals_for_bed = f"""
 4236                                SELECT  \"#CHROM\",
 4237                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4238                                        \"POS\"+{window}
 4239                                FROM {table_variants} as table_variants
 4240                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4241                            """
 4242                            regions = self.conn.execute(
 4243                                sql_query_intervals_for_bed
 4244                            ).fetchall()
 4245                            merged_regions = merge_regions(regions)
 4246                            log.debug(
 4247                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4248                            )
 4249
 4250                            header = ["#CHROM", "START", "END"]
 4251                            with open(tmp_bed_name, "w") as f:
 4252                                # Write the header with tab delimiter
 4253                                f.write("\t".join(header) + "\n")
 4254                                for d in merged_regions:
 4255                                    # Write each data row with tab delimiter
 4256                                    f.write("\t".join(map(str, d)) + "\n")
 4257
 4258                            # Tmp files
 4259                            tmp_annotation_vcf = NamedTemporaryFile(
 4260                                prefix=self.get_prefix(),
 4261                                dir=self.get_tmp_dir(),
 4262                                suffix=".vcf.gz",
 4263                                delete=False,
 4264                            )
 4265                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4266                            tmp_files.append(tmp_annotation_vcf_name)
 4267                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4268                            tmp_annotation_vcf_name_err = (
 4269                                tmp_annotation_vcf_name + ".err"
 4270                            )
 4271                            err_files.append(tmp_annotation_vcf_name_err)
 4272
 4273                            # Annotate Command
 4274                            log.debug(
 4275                                f"Annotation '{annotation}' - add bcftools command"
 4276                            )
 4277
 4278                            # Command
 4279                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4280
 4281                            # Add command
 4282                            commands.append(command_annotate)
 4283
 4284            # if some commands
 4285            if commands:
 4286
 4287                # Export VCF file
 4288                self.export_variant_vcf(
 4289                    vcf_file=tmp_vcf_name,
 4290                    remove_info=True,
 4291                    add_samples=False,
 4292                    index=True,
 4293                )
 4294
 4295                # Threads
 4296                # calculate threads for annotated commands
 4297                if commands:
 4298                    threads_bcftools_annotate = round(threads / len(commands))
 4299                else:
 4300                    threads_bcftools_annotate = 1
 4301
 4302                if not threads_bcftools_annotate:
 4303                    threads_bcftools_annotate = 1
 4304
 4305                # Add threads option to bcftools commands
 4306                if threads_bcftools_annotate > 1:
 4307                    commands_threaded = []
 4308                    for command in commands:
 4309                        commands_threaded.append(
 4310                            command.replace(
 4311                                f"{bcftools_bin_command} annotate ",
 4312                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4313                            )
 4314                        )
 4315                    commands = commands_threaded
 4316
 4317                # Command annotation multithreading
 4318                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4319                log.info(
 4320                    f"Annotation - Annotation multithreaded in "
 4321                    + str(len(commands))
 4322                    + " commands"
 4323                )
 4324
 4325                run_parallel_commands(commands, threads)
 4326
 4327                # Merge
 4328                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4329
 4330                if tmp_ann_vcf_list_cmd:
 4331
 4332                    # Tmp file
 4333                    tmp_annotate_vcf = NamedTemporaryFile(
 4334                        prefix=self.get_prefix(),
 4335                        dir=self.get_tmp_dir(),
 4336                        suffix=".vcf.gz",
 4337                        delete=True,
 4338                    )
 4339                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4340                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4341                    err_files.append(tmp_annotate_vcf_name_err)
 4342
 4343                    # Tmp file remove command
 4344                    tmp_files_remove_command = ""
 4345                    if tmp_files:
 4346                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4347
 4348                    # Command merge
 4349                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4350                    log.info(
 4351                        f"Annotation - Annotation merging "
 4352                        + str(len(commands))
 4353                        + " annotated files"
 4354                    )
 4355                    log.debug(f"Annotation - merge command: {merge_command}")
 4356                    run_parallel_commands([merge_command], 1)
 4357
 4358                    # Error messages
 4359                    log.info(f"Error/Warning messages:")
 4360                    error_message_command_all = []
 4361                    error_message_command_warning = []
 4362                    error_message_command_err = []
 4363                    for err_file in err_files:
 4364                        with open(err_file, "r") as f:
 4365                            for line in f:
 4366                                message = line.strip()
 4367                                error_message_command_all.append(message)
 4368                                if line.startswith("[W::"):
 4369                                    error_message_command_warning.append(message)
 4370                                if line.startswith("[E::"):
 4371                                    error_message_command_err.append(
 4372                                        f"{err_file}: " + message
 4373                                    )
 4374                    # log info
 4375                    for message in list(
 4376                        set(error_message_command_err + error_message_command_warning)
 4377                    ):
 4378                        log.info(f"   {message}")
 4379                    # debug info
 4380                    for message in list(set(error_message_command_all)):
 4381                        log.debug(f"   {message}")
 4382                    # failed
 4383                    if len(error_message_command_err):
 4384                        log.error("Annotation failed: Error in commands")
 4385                        raise ValueError("Annotation failed: Error in commands")
 4386
 4387                    # Update variants
 4388                    log.info(f"Annotation - Updating...")
 4389                    self.update_from_vcf(tmp_annotate_vcf_name)
 4390
 4391    def annotation_exomiser(self, threads: int = None) -> None:
 4392        """
 4393        This function annotate with Exomiser
 4394
 4395        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4396        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 4400            Default : None
 4401        - "preset" (string):
 4402            Analysis preset (available in config folder).
 4403            Used if no full "analysis" is provided.
 4404            Default: "exome"
 4405        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
 4407            Either a dict, or a file in JSON or YAML format.
 4408            Default: None
 4409        - "subject" (dict):
 4410            Sample parameters (see Exomiser docs).
 4411            Example:
 4412                "subject":
 4413                    {
 4414                        "id": "ISDBM322017",
 4415                        "sex": "FEMALE"
 4416                    }
 4417            Default: None
 4418        - "sample" (string):
 4419            Sample name to construct "subject" section:
 4420                "subject":
 4421                    {
 4422                        "id": "<sample>",
 4423                        "sex": "UNKNOWN_SEX"
 4424                    }
 4425            Default: None
 4426        - "phenotypicFeatures" (dict)
 4427            Phenotypic features to construct "subject" section.
 4428            Example:
 4429                "phenotypicFeatures":
 4430                    [
 4431                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4432                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4433                    ]
 4434        - "hpo" (list)
 4435            List of HPO ids as phenotypic features.
 4436            Example:
 4437                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4438            Default: []
 4439        - "outputOptions" (dict):
 4440            Output options (see Exomiser docs).
 4441            Default:
 4442                "output_options" =
 4443                    {
 4444                        "outputContributingVariantsOnly": False,
 4445                        "numGenes": 0,
 4446                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4447                    }
 4448        - "transcript_source" (string):
 4449            Transcript source (either "refseq", "ucsc", "ensembl")
 4450            Default: "refseq"
 4451        - "exomiser_to_info" (boolean):
 4452            Add exomiser TSV file columns as INFO fields in VCF.
 4453            Default: False
 4454        - "release" (string):
            Exomiser database release.
 4456            If not exists, database release will be downloaded (take a while).
 4457            Default: None (provided by application.properties configuration file)
 4458        - "exomiser_application_properties" (file):
 4459            Exomiser configuration file (see Exomiser docs).
 4460            Useful to automatically download databases (especially for specific genome databases).
 4461
 4462        Notes:
 4463        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
 4465
 4466        :param threads: The number of threads to use
 4467        :return: None.
 4468        """
 4469
 4470        # DEBUG
 4471        log.debug("Start annotation with Exomiser databases")
 4472
 4473        # Threads
 4474        if not threads:
 4475            threads = self.get_threads()
 4476        log.debug("Threads: " + str(threads))
 4477
 4478        # Config
 4479        config = self.get_config()
 4480        log.debug("Config: " + str(config))
 4481
 4482        # Config - Folders - Databases
 4483        databases_folders = (
 4484            config.get("folders", {})
 4485            .get("databases", {})
 4486            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4487        )
 4488        databases_folders = full_path(databases_folders)
 4489        if not os.path.exists(databases_folders):
 4490            log.error(f"Databases annotations: {databases_folders} NOT found")
 4491        log.debug("Databases annotations: " + str(databases_folders))
 4492
 4493        # Config - Exomiser
 4494        exomiser_bin_command = get_bin_command(
 4495            bin="exomiser-cli*.jar",
 4496            tool="exomiser",
 4497            bin_type="jar",
 4498            config=config,
 4499            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4500        )
 4501        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4502        if not exomiser_bin_command:
 4503            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4504            log.error(msg_err)
 4505            raise ValueError(msg_err)
 4506
 4507        # Param
 4508        param = self.get_param()
 4509        log.debug("Param: " + str(param))
 4510
 4511        # Param - Exomiser
 4512        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4513        log.debug(f"Param Exomiser: {param_exomiser}")
 4514
 4515        # Param - Assembly
 4516        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4517        log.debug("Assembly: " + str(assembly))
 4518
 4519        # Data
 4520        table_variants = self.get_table_variants()
 4521
 4522        # Check if not empty
 4523        log.debug("Check if not empty")
 4524        sql_query_chromosomes = (
 4525            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4526        )
 4527        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4528            log.info(f"VCF empty")
 4529            return False
 4530
 4531        # VCF header
 4532        vcf_reader = self.get_header()
 4533        log.debug("Initial header: " + str(vcf_reader.infos))
 4534
 4535        # Samples
 4536        samples = self.get_header_sample_list()
 4537        if not samples:
 4538            log.error("No Samples in VCF")
 4539            return False
 4540        log.debug(f"Samples: {samples}")
 4541
 4542        # Memory limit
 4543        memory_limit = self.get_memory("8G")
 4544        log.debug(f"memory_limit: {memory_limit}")
 4545
 4546        # Exomiser java options
 4547        exomiser_java_options = (
 4548            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4549        )
 4550        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4551
 4552        # Download Exomiser (if not exists)
 4553        exomiser_release = param_exomiser.get("release", None)
 4554        exomiser_application_properties = param_exomiser.get(
 4555            "exomiser_application_properties", None
 4556        )
 4557        databases_download_exomiser(
 4558            assemblies=[assembly],
 4559            exomiser_folder=databases_folders,
 4560            exomiser_release=exomiser_release,
 4561            exomiser_phenotype_release=exomiser_release,
 4562            exomiser_application_properties=exomiser_application_properties,
 4563        )
 4564
 4565        # Force annotation
 4566        force_update_annotation = True
 4567
 4568        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4569            log.debug("Start annotation Exomiser")
 4570
 4571            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4572
 4573                # tmp_dir = "/tmp/exomiser"
 4574
 4575                ### ANALYSIS ###
 4576                ################
 4577
 4578                # Create analysis.json through analysis dict
 4579                # either analysis in param or by default
 4580                # depending on preset exome/genome)
 4581
 4582                # Init analysis dict
 4583                param_exomiser_analysis_dict = {}
 4584
 4585                # analysis from param
 4586                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4587                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4588
                # If analysis in param -> load analysis json
 4590                if param_exomiser_analysis:
 4591
 4592                    # If param analysis is a file and exists
 4593                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4594                        param_exomiser_analysis
 4595                    ):
 4596                        # Load analysis file into analysis dict (either yaml or json)
 4597                        with open(param_exomiser_analysis) as json_file:
 4598                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4599
 4600                    # If param analysis is a dict
 4601                    elif isinstance(param_exomiser_analysis, dict):
 4602                        # Load analysis dict into analysis dict (either yaml or json)
 4603                        param_exomiser_analysis_dict = param_exomiser_analysis
 4604
 4605                    # Error analysis type
 4606                    else:
 4607                        log.error(f"Analysis type unknown. Check param file.")
 4608                        raise ValueError(f"Analysis type unknown. Check param file.")
 4609
 4610                # Case no input analysis config file/dict
 4611                # Use preset (exome/genome) to open default config file
 4612                if not param_exomiser_analysis_dict:
 4613
 4614                    # default preset
 4615                    default_preset = "exome"
 4616
 4617                    # Get param preset or default preset
 4618                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4619
 4620                    # Try to find if preset is a file
 4621                    if os.path.exists(param_exomiser_preset):
 4622                        # Preset file is provided in full path
 4623                        param_exomiser_analysis_default_config_file = (
 4624                            param_exomiser_preset
 4625                        )
 4626                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4627                    #     # Preset file is provided in full path
 4628                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4629                    elif os.path.exists(
 4630                        os.path.join(folder_config, param_exomiser_preset)
 4631                    ):
 4632                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4633                        param_exomiser_analysis_default_config_file = os.path.join(
 4634                            folder_config, param_exomiser_preset
 4635                        )
 4636                    else:
 4637                        # Construct preset file
 4638                        param_exomiser_analysis_default_config_file = os.path.join(
 4639                            folder_config,
 4640                            f"preset-{param_exomiser_preset}-analysis.json",
 4641                        )
 4642
 4643                    # If preset file exists
 4644                    param_exomiser_analysis_default_config_file = full_path(
 4645                        param_exomiser_analysis_default_config_file
 4646                    )
 4647                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
 4649                        with open(
 4650                            param_exomiser_analysis_default_config_file
 4651                        ) as json_file:
 4652                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4653                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4654                                json_file
 4655                            )
 4656
 4657                    # Error preset file
 4658                    else:
 4659                        log.error(
 4660                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4661                        )
 4662                        raise ValueError(
 4663                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4664                        )
 4665
 4666                # If no analysis dict created
 4667                if not param_exomiser_analysis_dict:
 4668                    log.error(f"No analysis config")
 4669                    raise ValueError(f"No analysis config")
 4670
 4671                # Log
 4672                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4673
 4674                ### PHENOPACKET ###
 4675                ###################
 4676
 4677                # If no PhenoPacket in analysis dict -> check in param
 4678                if "phenopacket" not in param_exomiser_analysis_dict:
 4679
                    # If PhenoPacket in param -> load analysis json
 4681                    if param_exomiser.get("phenopacket", None):
 4682
 4683                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4684                        param_exomiser_phenopacket = full_path(
 4685                            param_exomiser_phenopacket
 4686                        )
 4687
 4688                        # If param phenopacket is a file and exists
 4689                        if isinstance(
 4690                            param_exomiser_phenopacket, str
 4691                        ) and os.path.exists(param_exomiser_phenopacket):
 4692                            # Load phenopacket file into analysis dict (either yaml or json)
 4693                            with open(param_exomiser_phenopacket) as json_file:
 4694                                param_exomiser_analysis_dict["phenopacket"] = (
 4695                                    yaml.safe_load(json_file)
 4696                                )
 4697
 4698                        # If param phenopacket is a dict
 4699                        elif isinstance(param_exomiser_phenopacket, dict):
 4700                            # Load phenopacket dict into analysis dict (either yaml or json)
 4701                            param_exomiser_analysis_dict["phenopacket"] = (
 4702                                param_exomiser_phenopacket
 4703                            )
 4704
 4705                        # Error phenopacket type
 4706                        else:
 4707                            log.error(f"Phenopacket type unknown. Check param file.")
 4708                            raise ValueError(
 4709                                f"Phenopacket type unknown. Check param file."
 4710                            )
 4711
 4712                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4713                if "phenopacket" not in param_exomiser_analysis_dict:
 4714
 4715                    # Init PhenoPacket
 4716                    param_exomiser_analysis_dict["phenopacket"] = {
 4717                        "id": "analysis",
 4718                        "proband": {},
 4719                    }
 4720
 4721                    ### Add subject ###
 4722
 4723                    # If subject exists
 4724                    param_exomiser_subject = param_exomiser.get("subject", {})
 4725
 4726                    # If subject not exists -> found sample ID
 4727                    if not param_exomiser_subject:
 4728
 4729                        # Found sample ID in param
 4730                        sample = param_exomiser.get("sample", None)
 4731
 4732                        # Find sample ID (first sample)
 4733                        if not sample:
 4734                            sample_list = self.get_header_sample_list()
 4735                            if len(sample_list) > 0:
 4736                                sample = sample_list[0]
 4737                            else:
 4738                                log.error(f"No sample found")
 4739                                raise ValueError(f"No sample found")
 4740
 4741                        # Create subject
 4742                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4743
 4744                    # Add to dict
 4745                    param_exomiser_analysis_dict["phenopacket"][
 4746                        "subject"
 4747                    ] = param_exomiser_subject
 4748
 4749                    ### Add "phenotypicFeatures" ###
 4750
 4751                    # If phenotypicFeatures exists
 4752                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4753                        "phenotypicFeatures", []
 4754                    )
 4755
 4756                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4757                    if not param_exomiser_phenotypicfeatures:
 4758
 4759                        # Found HPO in param
 4760                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4761
 4762                        # Split HPO if list in string format separated by comma
 4763                        if isinstance(param_exomiser_hpo, str):
 4764                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4765
 4766                        # Create HPO list
 4767                        for hpo in param_exomiser_hpo:
 4768                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4769                            param_exomiser_phenotypicfeatures.append(
 4770                                {
 4771                                    "type": {
 4772                                        "id": f"HP:{hpo_clean}",
 4773                                        "label": f"HP:{hpo_clean}",
 4774                                    }
 4775                                }
 4776                            )
 4777
 4778                    # Add to dict
 4779                    param_exomiser_analysis_dict["phenopacket"][
 4780                        "phenotypicFeatures"
 4781                    ] = param_exomiser_phenotypicfeatures
 4782
 4783                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4784                    if not param_exomiser_phenotypicfeatures:
 4785                        for step in param_exomiser_analysis_dict.get(
 4786                            "analysis", {}
 4787                        ).get("steps", []):
 4788                            if "hiPhivePrioritiser" in step:
 4789                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4790                                    "steps", []
 4791                                ).remove(step)
 4792
 4793                ### Add Input File ###
 4794
 4795                # Initial file name and htsFiles
 4796                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4797                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4798                    {
 4799                        "uri": tmp_vcf_name,
 4800                        "htsFormat": "VCF",
 4801                        "genomeAssembly": assembly,
 4802                    }
 4803                ]
 4804
 4805                ### Add metaData ###
 4806
 4807                # If metaData not in analysis dict
 4808                if "metaData" not in param_exomiser_analysis_dict:
 4809                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4810                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4811                        "createdBy": "howard",
 4812                        "phenopacketSchemaVersion": 1,
 4813                    }
 4814
 4815                ### OutputOptions ###
 4816
 4817                # Init output result folder
 4818                output_results = os.path.join(tmp_dir, "results")
 4819
 4820                # If no outputOptions in analysis dict
 4821                if "outputOptions" not in param_exomiser_analysis_dict:
 4822
 4823                    # default output formats
 4824                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4825
 4826                    # Get outputOptions in param
 4827                    output_options = param_exomiser.get("outputOptions", None)
 4828
 4829                    # If no output_options in param -> check
 4830                    if not output_options:
 4831                        output_options = {
 4832                            "outputContributingVariantsOnly": False,
 4833                            "numGenes": 0,
 4834                            "outputFormats": defaut_output_formats,
 4835                        }
 4836
 4837                    # Replace outputDirectory in output options
 4838                    output_options["outputDirectory"] = output_results
 4839                    output_options["outputFileName"] = "howard"
 4840
 4841                    # Add outputOptions in analysis dict
 4842                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4843
 4844                else:
 4845
 4846                    # Replace output_results and output format (if exists in param)
 4847                    param_exomiser_analysis_dict["outputOptions"][
 4848                        "outputDirectory"
 4849                    ] = output_results
 4850                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4851                        list(
 4852                            set(
 4853                                param_exomiser_analysis_dict.get(
 4854                                    "outputOptions", {}
 4855                                ).get("outputFormats", [])
 4856                                + ["TSV_VARIANT", "VCF"]
 4857                            )
 4858                        )
 4859                    )
 4860
 4861                # log
 4862                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4863
 4864                ### ANALYSIS FILE ###
 4865                #####################
 4866
 4867                ### Full JSON analysis config file ###
 4868
 4869                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4870                with open(exomiser_analysis, "w") as fp:
 4871                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4872
 4873                ### SPLIT analysis and sample config files
 4874
 4875                # Splitted analysis dict
 4876                param_exomiser_analysis_dict_for_split = (
 4877                    param_exomiser_analysis_dict.copy()
 4878                )
 4879
 4880                # Phenopacket JSON file
 4881                exomiser_analysis_phenopacket = os.path.join(
 4882                    tmp_dir, "analysis_phenopacket.json"
 4883                )
 4884                with open(exomiser_analysis_phenopacket, "w") as fp:
 4885                    json.dump(
 4886                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4887                        fp,
 4888                        indent=4,
 4889                    )
 4890
 4891                # Analysis JSON file without Phenopacket parameters
 4892                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4893                exomiser_analysis_analysis = os.path.join(
 4894                    tmp_dir, "analysis_analysis.json"
 4895                )
 4896                with open(exomiser_analysis_analysis, "w") as fp:
 4897                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4898
 4899                ### INITAL VCF file ###
 4900                #######################
 4901
 4902                ### Create list of samples to use and include inti initial VCF file ####
 4903
 4904                # Subject (main sample)
 4905                # Get sample ID in analysis dict
 4906                sample_subject = (
 4907                    param_exomiser_analysis_dict.get("phenopacket", {})
 4908                    .get("subject", {})
 4909                    .get("id", None)
 4910                )
 4911                sample_proband = (
 4912                    param_exomiser_analysis_dict.get("phenopacket", {})
 4913                    .get("proband", {})
 4914                    .get("subject", {})
 4915                    .get("id", None)
 4916                )
 4917                sample = []
 4918                if sample_subject:
 4919                    sample.append(sample_subject)
 4920                if sample_proband:
 4921                    sample.append(sample_proband)
 4922
 4923                # Get sample ID within Pedigree
 4924                pedigree_persons_list = (
 4925                    param_exomiser_analysis_dict.get("phenopacket", {})
 4926                    .get("pedigree", {})
 4927                    .get("persons", {})
 4928                )
 4929
 4930                # Create list with all sample ID in pedigree (if exists)
 4931                pedigree_persons = []
 4932                for person in pedigree_persons_list:
 4933                    pedigree_persons.append(person.get("individualId"))
 4934
 4935                # Concat subject sample ID and samples ID in pedigreesamples
 4936                samples = list(set(sample + pedigree_persons))
 4937
 4938                # Check if sample list is not empty
 4939                if not samples:
 4940                    log.error(f"No samples found")
 4941                    raise ValueError(f"No samples found")
 4942
 4943                # Create VCF with sample (either sample in param or first one by default)
 4944                # Export VCF file
 4945                self.export_variant_vcf(
 4946                    vcf_file=tmp_vcf_name,
 4947                    remove_info=True,
 4948                    add_samples=True,
 4949                    list_samples=samples,
 4950                    index=False,
 4951                )
 4952
 4953                ### Execute Exomiser ###
 4954                ########################
 4955
 4956                # Init command
 4957                exomiser_command = ""
 4958
 4959                # Command exomiser options
 4960                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4961
 4962                # Release
 4963                exomiser_release = param_exomiser.get("release", None)
 4964                if exomiser_release:
 4965                    # phenotype data version
 4966                    exomiser_options += (
 4967                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4968                    )
 4969                    # data version
 4970                    exomiser_options += (
 4971                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4972                    )
 4973                    # variant white list
 4974                    variant_white_list_file = (
 4975                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4976                    )
 4977                    if os.path.exists(
 4978                        os.path.join(
 4979                            databases_folders, assembly, variant_white_list_file
 4980                        )
 4981                    ):
 4982                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4983
 4984                # transcript_source
 4985                transcript_source = param_exomiser.get(
 4986                    "transcript_source", None
 4987                )  # ucsc, refseq, ensembl
 4988                if transcript_source:
 4989                    exomiser_options += (
 4990                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4991                    )
 4992
 4993                # If analysis contain proband param
 4994                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4995                    "proband", {}
 4996                ):
 4997                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4998
 4999                # If no proband (usually uniq sample)
 5000                else:
 5001                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5002
 5003                # Log
 5004                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5005
 5006                # Run command
 5007                result = subprocess.call(
 5008                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5009                )
 5010                if result:
 5011                    log.error("Exomiser command failed")
 5012                    raise ValueError("Exomiser command failed")
 5013
 5014                ### RESULTS ###
 5015                ###############
 5016
 5017                ### Annotate with TSV fields ###
 5018
 5019                # Init result tsv file
 5020                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5021
 5022                # Init result tsv file
 5023                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5024
 5025                # Parse TSV file and explode columns in INFO field
 5026                if exomiser_to_info and os.path.exists(output_results_tsv):
 5027
 5028                    # Log
 5029                    log.debug("Exomiser columns to VCF INFO field")
 5030
 5031                    # Retrieve columns and types
 5032                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5033                    output_results_tsv_df = self.get_query_to_df(query)
 5034                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5035
 5036                    # Init concat fields for update
 5037                    sql_query_update_concat_fields = []
 5038
 5039                    # Fields to avoid
 5040                    fields_to_avoid = [
 5041                        "CONTIG",
 5042                        "START",
 5043                        "END",
 5044                        "REF",
 5045                        "ALT",
 5046                        "QUAL",
 5047                        "FILTER",
 5048                        "GENOTYPE",
 5049                    ]
 5050
 5051                    # List all columns to add into header
 5052                    for header_column in output_results_tsv_columns:
 5053
 5054                        # If header column is enable
 5055                        if header_column not in fields_to_avoid:
 5056
 5057                            # Header info type
 5058                            header_info_type = "String"
 5059                            header_column_df = output_results_tsv_df[header_column]
 5060                            header_column_df_dtype = header_column_df.dtype
 5061                            if header_column_df_dtype == object:
 5062                                if (
 5063                                    pd.to_numeric(header_column_df, errors="coerce")
 5064                                    .notnull()
 5065                                    .all()
 5066                                ):
 5067                                    header_info_type = "Float"
 5068                            else:
 5069                                header_info_type = "Integer"
 5070
 5071                            # Header info
 5072                            characters_to_validate = ["-"]
 5073                            pattern = "[" + "".join(characters_to_validate) + "]"
 5074                            header_info_name = re.sub(
 5075                                pattern,
 5076                                "_",
 5077                                f"Exomiser_{header_column}".replace("#", ""),
 5078                            )
 5079                            header_info_number = "."
 5080                            header_info_description = (
 5081                                f"Exomiser {header_column} annotation"
 5082                            )
 5083                            header_info_source = "Exomiser"
 5084                            header_info_version = "unknown"
 5085                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5086                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5087                                header_info_name,
 5088                                header_info_number,
 5089                                header_info_type,
 5090                                header_info_description,
 5091                                header_info_source,
 5092                                header_info_version,
 5093                                header_info_code,
 5094                            )
 5095
 5096                            # Add field to add for update to concat fields
 5097                            sql_query_update_concat_fields.append(
 5098                                f"""
 5099                                CASE
 5100                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5101                                    THEN concat(
 5102                                        '{header_info_name}=',
 5103                                        table_parquet."{header_column}",
 5104                                        ';'
 5105                                        )
 5106
 5107                                    ELSE ''
 5108                                END
 5109                            """
 5110                            )
 5111
 5112                    # Update query
 5113                    sql_query_update = f"""
 5114                        UPDATE {table_variants} as table_variants
 5115                            SET INFO = concat(
 5116                                            CASE
 5117                                                WHEN INFO NOT IN ('', '.')
 5118                                                THEN INFO
 5119                                                ELSE ''
 5120                                            END,
 5121                                            CASE
 5122                                                WHEN table_variants.INFO NOT IN ('','.')
 5123                                                THEN ';'
 5124                                                ELSE ''
 5125                                            END,
 5126                                            (
 5127                                            SELECT 
 5128                                                concat(
 5129                                                    {",".join(sql_query_update_concat_fields)}
 5130                                                )
 5131                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5132                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5133                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5134                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5135                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5136                                            )
 5137                                        )
 5138                            ;
 5139                        """
 5140
 5141                    # Update
 5142                    self.conn.execute(sql_query_update)
 5143
 5144                ### Annotate with VCF INFO field ###
 5145
 5146                # Init result VCF file
 5147                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5148
 5149                # If VCF exists
 5150                if os.path.exists(output_results_vcf):
 5151
 5152                    # Log
 5153                    log.debug("Exomiser result VCF update variants")
 5154
 5155                    # Find Exomiser INFO field annotation in header
 5156                    with gzip.open(output_results_vcf, "rt") as f:
 5157                        header_list = self.read_vcf_header(f)
 5158                    exomiser_vcf_header = vcf.Reader(
 5159                        io.StringIO("\n".join(header_list))
 5160                    )
 5161
 5162                    # Add annotation INFO field to header
 5163                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5164
 5165                    # Update variants with VCF
 5166                    self.update_from_vcf(output_results_vcf)
 5167
 5168        return True
 5169
 5170    def annotation_snpeff(self, threads: int = None) -> None:
 5171        """
 5172        This function annotate with snpEff
 5173
 5174        :param threads: The number of threads to use
 5175        :return: the value of the variable "return_value".
 5176        """
 5177
 5178        # DEBUG
 5179        log.debug("Start annotation with snpeff databases")
 5180
 5181        # Threads
 5182        if not threads:
 5183            threads = self.get_threads()
 5184        log.debug("Threads: " + str(threads))
 5185
 5186        # DEBUG
 5187        delete_tmp = True
 5188        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5189            delete_tmp = False
 5190            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5191
 5192        # Config
 5193        config = self.get_config()
 5194        log.debug("Config: " + str(config))
 5195
 5196        # Config - Folders - Databases
 5197        databases_folders = (
 5198            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5199        )
 5200        log.debug("Databases annotations: " + str(databases_folders))
 5201
 5202        # Config - snpEff bin command
 5203        snpeff_bin_command = get_bin_command(
 5204            bin="snpEff.jar",
 5205            tool="snpeff",
 5206            bin_type="jar",
 5207            config=config,
 5208            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5209        )
 5210        if not snpeff_bin_command:
 5211            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5212            log.error(msg_err)
 5213            raise ValueError(msg_err)
 5214
 5215        # Config - snpEff databases
 5216        snpeff_databases = (
 5217            config.get("folders", {})
 5218            .get("databases", {})
 5219            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5220        )
 5221        snpeff_databases = full_path(snpeff_databases)
 5222        if snpeff_databases is not None and snpeff_databases != "":
 5223            log.debug(f"Create snpEff databases folder")
 5224            if not os.path.exists(snpeff_databases):
 5225                os.makedirs(snpeff_databases)
 5226
 5227        # Param
 5228        param = self.get_param()
 5229        log.debug("Param: " + str(param))
 5230
 5231        # Param
 5232        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5233        log.debug("Options: " + str(options))
 5234
 5235        # Param - Assembly
 5236        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5237
 5238        # Param - Options
 5239        snpeff_options = (
 5240            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5241        )
 5242        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5243        snpeff_csvstats = (
 5244            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5245        )
 5246        if snpeff_stats:
 5247            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5248            snpeff_stats = full_path(snpeff_stats)
 5249            snpeff_options += f" -stats {snpeff_stats}"
 5250        if snpeff_csvstats:
 5251            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5252            snpeff_csvstats = full_path(snpeff_csvstats)
 5253            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5254
 5255        # Data
 5256        table_variants = self.get_table_variants()
 5257
 5258        # Check if not empty
 5259        log.debug("Check if not empty")
 5260        sql_query_chromosomes = (
 5261            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5262        )
 5263        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5264        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5265            log.info(f"VCF empty")
 5266            return
 5267
 5268        # Export in VCF
 5269        log.debug("Create initial file to annotate")
 5270        tmp_vcf = NamedTemporaryFile(
 5271            prefix=self.get_prefix(),
 5272            dir=self.get_tmp_dir(),
 5273            suffix=".vcf.gz",
 5274            delete=True,
 5275        )
 5276        tmp_vcf_name = tmp_vcf.name
 5277
 5278        # VCF header
 5279        vcf_reader = self.get_header()
 5280        log.debug("Initial header: " + str(vcf_reader.infos))
 5281
 5282        # Existing annotations
 5283        for vcf_annotation in self.get_header().infos:
 5284
 5285            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5286            log.debug(
 5287                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5288            )
 5289
 5290        # Memory limit
 5291        # if config.get("memory", None):
 5292        #     memory_limit = config.get("memory", "8G")
 5293        # else:
 5294        #     memory_limit = "8G"
 5295        memory_limit = self.get_memory("8G")
 5296        log.debug(f"memory_limit: {memory_limit}")
 5297
 5298        # snpEff java options
 5299        snpeff_java_options = (
 5300            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5301        )
 5302        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5303
 5304        force_update_annotation = True
 5305
 5306        if "ANN" not in self.get_header().infos or force_update_annotation:
 5307
 5308            # Check snpEff database
 5309            log.debug(f"Check snpEff databases {[assembly]}")
 5310            databases_download_snpeff(
 5311                folder=snpeff_databases, assemblies=[assembly], config=config
 5312            )
 5313
 5314            # Export VCF file
 5315            self.export_variant_vcf(
 5316                vcf_file=tmp_vcf_name,
 5317                remove_info=True,
 5318                add_samples=False,
 5319                index=True,
 5320            )
 5321
 5322            # Tmp file
 5323            err_files = []
 5324            tmp_annotate_vcf = NamedTemporaryFile(
 5325                prefix=self.get_prefix(),
 5326                dir=self.get_tmp_dir(),
 5327                suffix=".vcf",
 5328                delete=False,
 5329            )
 5330            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5331            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5332            err_files.append(tmp_annotate_vcf_name_err)
 5333
 5334            # Command
 5335            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5336            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5337            run_parallel_commands([snpeff_command], 1)
 5338
 5339            # Error messages
 5340            log.info(f"Error/Warning messages:")
 5341            error_message_command_all = []
 5342            error_message_command_warning = []
 5343            error_message_command_err = []
 5344            for err_file in err_files:
 5345                with open(err_file, "r") as f:
 5346                    for line in f:
 5347                        message = line.strip()
 5348                        error_message_command_all.append(message)
 5349                        if line.startswith("[W::"):
 5350                            error_message_command_warning.append(message)
 5351                        if line.startswith("[E::"):
 5352                            error_message_command_err.append(f"{err_file}: " + message)
 5353            # log info
 5354            for message in list(
 5355                set(error_message_command_err + error_message_command_warning)
 5356            ):
 5357                log.info(f"   {message}")
 5358            # debug info
 5359            for message in list(set(error_message_command_all)):
 5360                log.debug(f"   {message}")
 5361            # failed
 5362            if len(error_message_command_err):
 5363                log.error("Annotation failed: Error in commands")
 5364                raise ValueError("Annotation failed: Error in commands")
 5365
 5366            # Find annotation in header
 5367            with open(tmp_annotate_vcf_name, "rt") as f:
 5368                header_list = self.read_vcf_header(f)
 5369            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5370
 5371            for ann in annovar_vcf_header.infos:
 5372                if ann not in self.get_header().infos:
 5373                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5374
 5375            # Update variants
 5376            log.info(f"Annotation - Updating...")
 5377            self.update_from_vcf(tmp_annotate_vcf_name)
 5378
 5379        else:
 5380            if "ANN" in self.get_header().infos:
 5381                log.debug(f"Existing snpEff annotations in VCF")
 5382            if force_update_annotation:
 5383                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5384
 5385    def annotation_annovar(self, threads: int = None) -> None:
 5386        """
 5387        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5388        annotations
 5389
 5390        :param threads: number of threads to use
 5391        :return: the value of the variable "return_value".
 5392        """
 5393
 5394        # DEBUG
 5395        log.debug("Start annotation with Annovar databases")
 5396
 5397        # Threads
 5398        if not threads:
 5399            threads = self.get_threads()
 5400        log.debug("Threads: " + str(threads))
 5401
 5402        # Tmp en Err files
 5403        tmp_files = []
 5404        err_files = []
 5405
 5406        # DEBUG
 5407        delete_tmp = True
 5408        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5409            delete_tmp = False
 5410            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5411
 5412        # Config
 5413        config = self.get_config()
 5414        log.debug("Config: " + str(config))
 5415
 5416        # Config - Folders - Databases
 5417        databases_folders = (
 5418            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5419        )
 5420        log.debug("Databases annotations: " + str(databases_folders))
 5421
 5422        # Config - annovar bin command
 5423        annovar_bin_command = get_bin_command(
 5424            bin="table_annovar.pl",
 5425            tool="annovar",
 5426            bin_type="perl",
 5427            config=config,
 5428            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5429        )
 5430        if not annovar_bin_command:
 5431            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5432            log.error(msg_err)
 5433            raise ValueError(msg_err)
 5434
 5435        # Config - BCFTools bin command
 5436        bcftools_bin_command = get_bin_command(
 5437            bin="bcftools",
 5438            tool="bcftools",
 5439            bin_type="bin",
 5440            config=config,
 5441            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5442        )
 5443        if not bcftools_bin_command:
 5444            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5445            log.error(msg_err)
 5446            raise ValueError(msg_err)
 5447
 5448        # Config - annovar databases
 5449        annovar_databases = (
 5450            config.get("folders", {})
 5451            .get("databases", {})
 5452            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5453        )
 5454        if annovar_databases is not None:
 5455            if isinstance(annovar_databases, list):
 5456                annovar_databases = full_path(annovar_databases[0])
 5457                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5458            annovar_databases = full_path(annovar_databases)
 5459            if not os.path.exists(annovar_databases):
 5460                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5461                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5462        else:
 5463            msg_err = f"Annovar databases configuration failed"
 5464            log.error(msg_err)
 5465            raise ValueError(msg_err)
 5466
 5467        # Param
 5468        param = self.get_param()
 5469        log.debug("Param: " + str(param))
 5470
 5471        # Param - options
 5472        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5473        log.debug("Options: " + str(options))
 5474
 5475        # Param - annotations
 5476        annotations = (
 5477            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5478        )
 5479        log.debug("Annotations: " + str(annotations))
 5480
 5481        # Param - Assembly
 5482        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5483
 5484        # Annovar database assembly
 5485        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5486        if annovar_databases_assembly != "" and not os.path.exists(
 5487            annovar_databases_assembly
 5488        ):
 5489            os.makedirs(annovar_databases_assembly)
 5490
 5491        # Data
 5492        table_variants = self.get_table_variants()
 5493
 5494        # Check if not empty
 5495        log.debug("Check if not empty")
 5496        sql_query_chromosomes = (
 5497            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5498        )
 5499        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5500        if not sql_query_chromosomes_df["count"][0]:
 5501            log.info(f"VCF empty")
 5502            return
 5503
 5504        # VCF header
 5505        vcf_reader = self.get_header()
 5506        log.debug("Initial header: " + str(vcf_reader.infos))
 5507
 5508        # Existing annotations
 5509        for vcf_annotation in self.get_header().infos:
 5510
 5511            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5512            log.debug(
 5513                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5514            )
 5515
 5516        force_update_annotation = True
 5517
 5518        if annotations:
 5519
 5520            commands = []
 5521            tmp_annotates_vcf_name_list = []
 5522
 5523            # Export in VCF
 5524            log.debug("Create initial file to annotate")
 5525            tmp_vcf = NamedTemporaryFile(
 5526                prefix=self.get_prefix(),
 5527                dir=self.get_tmp_dir(),
 5528                suffix=".vcf.gz",
 5529                delete=False,
 5530            )
 5531            tmp_vcf_name = tmp_vcf.name
 5532            tmp_files.append(tmp_vcf_name)
 5533            tmp_files.append(tmp_vcf_name + ".tbi")
 5534
 5535            # Export VCF file
 5536            self.export_variant_vcf(
 5537                vcf_file=tmp_vcf_name,
 5538                remove_info=".",
 5539                add_samples=False,
 5540                index=True,
 5541            )
 5542
 5543            # Create file for field rename
 5544            log.debug("Create file for field rename")
 5545            tmp_rename = NamedTemporaryFile(
 5546                prefix=self.get_prefix(),
 5547                dir=self.get_tmp_dir(),
 5548                suffix=".rename",
 5549                delete=False,
 5550            )
 5551            tmp_rename_name = tmp_rename.name
 5552            tmp_files.append(tmp_rename_name)
 5553
 5554            # Check Annovar database
 5555            log.debug(
 5556                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5557            )
 5558            databases_download_annovar(
 5559                folder=annovar_databases,
 5560                files=list(annotations.keys()),
 5561                assemblies=[assembly],
 5562            )
 5563
 5564            for annotation in annotations:
 5565                annotation_fields = annotations[annotation]
 5566
 5567                if not annotation_fields:
 5568                    annotation_fields = {"INFO": None}
 5569
 5570                log.info(f"Annotations Annovar - database '{annotation}'")
 5571                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5572
 5573                # Tmp file for annovar
 5574                err_files = []
 5575                tmp_annotate_vcf_directory = TemporaryDirectory(
 5576                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5577                )
 5578                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5579                tmp_annotate_vcf_name_annovar = (
 5580                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5581                )
 5582                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5583                err_files.append(tmp_annotate_vcf_name_err)
 5584                tmp_files.append(tmp_annotate_vcf_name_err)
 5585
 5586                # Tmp file final vcf annotated by annovar
 5587                tmp_annotate_vcf = NamedTemporaryFile(
 5588                    prefix=self.get_prefix(),
 5589                    dir=self.get_tmp_dir(),
 5590                    suffix=".vcf.gz",
 5591                    delete=False,
 5592                )
 5593                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5594                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5595                tmp_files.append(tmp_annotate_vcf_name)
 5596                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5597
 5598                # Number of fields
 5599                annotation_list = []
 5600                annotation_renamed_list = []
 5601
 5602                for annotation_field in annotation_fields:
 5603
 5604                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5605                    annotation_fields_new_name = annotation_fields.get(
 5606                        annotation_field, annotation_field
 5607                    )
 5608                    if not annotation_fields_new_name:
 5609                        annotation_fields_new_name = annotation_field
 5610
 5611                    if (
 5612                        force_update_annotation
 5613                        or annotation_fields_new_name not in self.get_header().infos
 5614                    ):
 5615                        annotation_list.append(annotation_field)
 5616                        annotation_renamed_list.append(annotation_fields_new_name)
 5617                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5618                        log.warning(
 5619                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5620                        )
 5621
 5622                    # Add rename info
 5623                    run_parallel_commands(
 5624                        [
 5625                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5626                        ],
 5627                        1,
 5628                    )
 5629
 5630                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5631                log.debug("annotation_list: " + str(annotation_list))
 5632
 5633                # protocol
 5634                protocol = annotation
 5635
 5636                # argument
 5637                argument = ""
 5638
 5639                # operation
 5640                operation = "f"
 5641                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5642                    "ensGene"
 5643                ):
 5644                    operation = "g"
 5645                    if options.get("genebase", None):
 5646                        argument = f"""'{options.get("genebase","")}'"""
 5647                elif annotation in ["cytoBand"]:
 5648                    operation = "r"
 5649
 5650                # argument option
 5651                argument_option = ""
 5652                if argument != "":
 5653                    argument_option = " --argument " + argument
 5654
 5655                # command options
 5656                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5657                for option in options:
 5658                    if option not in ["genebase"]:
 5659                        command_options += f""" --{option}={options[option]}"""
 5660
 5661                # Command
 5662
 5663                # Command - Annovar
 5664                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5665                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5666
 5667                # Command - start pipe
 5668                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5669
 5670                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5671                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5672
 5673                # Command - Special characters (refGene annotation)
 5674                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5675
 5676                # Command - Clean empty fields (with value ".")
 5677                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5678
 5679                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5680                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5681                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5682                    # for ann in annotation_renamed_list:
 5683                    for ann in annotation_list:
 5684                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5685
 5686                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5687
 5688                # Command - indexing
 5689                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5690
 5691                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5692                run_parallel_commands([command_annovar], 1)
 5693
 5694                # Error messages
 5695                log.info(f"Error/Warning messages:")
 5696                error_message_command_all = []
 5697                error_message_command_warning = []
 5698                error_message_command_err = []
 5699                for err_file in err_files:
 5700                    with open(err_file, "r") as f:
 5701                        for line in f:
 5702                            message = line.strip()
 5703                            error_message_command_all.append(message)
 5704                            if line.startswith("[W::") or line.startswith("WARNING"):
 5705                                error_message_command_warning.append(message)
 5706                            if line.startswith("[E::") or line.startswith("ERROR"):
 5707                                error_message_command_err.append(
 5708                                    f"{err_file}: " + message
 5709                                )
 5710                # log info
 5711                for message in list(
 5712                    set(error_message_command_err + error_message_command_warning)
 5713                ):
 5714                    log.info(f"   {message}")
 5715                # debug info
 5716                for message in list(set(error_message_command_all)):
 5717                    log.debug(f"   {message}")
 5718                # failed
 5719                if len(error_message_command_err):
 5720                    log.error("Annotation failed: Error in commands")
 5721                    raise ValueError("Annotation failed: Error in commands")
 5722
 5723            if tmp_annotates_vcf_name_list:
 5724
 5725                # List of annotated files
 5726                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5727
 5728                # Tmp file
 5729                tmp_annotate_vcf = NamedTemporaryFile(
 5730                    prefix=self.get_prefix(),
 5731                    dir=self.get_tmp_dir(),
 5732                    suffix=".vcf.gz",
 5733                    delete=False,
 5734                )
 5735                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5736                tmp_files.append(tmp_annotate_vcf_name)
 5737                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5738                err_files.append(tmp_annotate_vcf_name_err)
 5739                tmp_files.append(tmp_annotate_vcf_name_err)
 5740
 5741                # Command merge
 5742                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5743                log.info(
 5744                    f"Annotation Annovar - Annotation merging "
 5745                    + str(len(tmp_annotates_vcf_name_list))
 5746                    + " annotated files"
 5747                )
 5748                log.debug(f"Annotation - merge command: {merge_command}")
 5749                run_parallel_commands([merge_command], 1)
 5750
 5751                # Find annotation in header
 5752                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5753                    header_list = self.read_vcf_header(f)
 5754                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5755
 5756                for ann in annovar_vcf_header.infos:
 5757                    if ann not in self.get_header().infos:
 5758                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5759
 5760                # Update variants
 5761                log.info(f"Annotation Annovar - Updating...")
 5762                self.update_from_vcf(tmp_annotate_vcf_name)
 5763
 5764            # Clean files
 5765            # Tmp file remove command
 5766            if True:
 5767                tmp_files_remove_command = ""
 5768                if tmp_files:
 5769                    tmp_files_remove_command = " ".join(tmp_files)
 5770                clean_command = f" rm -f {tmp_files_remove_command} "
 5771                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5772                log.debug(f"Annotation - cleaning command: {clean_command}")
 5773                run_parallel_commands([clean_command], 1)
 5774
 5775    # Parquet
 5776    def annotation_parquet(self, threads: int = None) -> None:
 5777        """
 5778        It takes a VCF file, and annotates it with a parquet file
 5779
 5780        :param threads: number of threads to use for the annotation
 5781        :return: the value of the variable "result".
 5782        """
 5783
 5784        # DEBUG
 5785        log.debug("Start annotation with parquet databases")
 5786
 5787        # Threads
 5788        if not threads:
 5789            threads = self.get_threads()
 5790        log.debug("Threads: " + str(threads))
 5791
 5792        # DEBUG
 5793        delete_tmp = True
 5794        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5795            delete_tmp = False
 5796            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5797
 5798        # Config
 5799        databases_folders = set(
 5800            self.get_config()
 5801            .get("folders", {})
 5802            .get("databases", {})
 5803            .get("annotations", ["."])
 5804            + self.get_config()
 5805            .get("folders", {})
 5806            .get("databases", {})
 5807            .get("parquet", ["."])
 5808        )
 5809        log.debug("Databases annotations: " + str(databases_folders))
 5810
 5811        # Param
 5812        annotations = (
 5813            self.get_param()
 5814            .get("annotation", {})
 5815            .get("parquet", {})
 5816            .get("annotations", None)
 5817        )
 5818        log.debug("Annotations: " + str(annotations))
 5819
 5820        # Assembly
 5821        assembly = self.get_param().get(
 5822            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5823        )
 5824
 5825        # Force Update Annotation
 5826        force_update_annotation = (
 5827            self.get_param()
 5828            .get("annotation", {})
 5829            .get("options", {})
 5830            .get("annotations_update", False)
 5831        )
 5832        log.debug(f"force_update_annotation={force_update_annotation}")
 5833        force_append_annotation = (
 5834            self.get_param()
 5835            .get("annotation", {})
 5836            .get("options", {})
 5837            .get("annotations_append", False)
 5838        )
 5839        log.debug(f"force_append_annotation={force_append_annotation}")
 5840
 5841        # Data
 5842        table_variants = self.get_table_variants()
 5843
 5844        # Check if not empty
 5845        log.debug("Check if not empty")
 5846        sql_query_chromosomes_df = self.get_query_to_df(
 5847            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5848        )
 5849        if not sql_query_chromosomes_df["count"][0]:
 5850            log.info(f"VCF empty")
 5851            return
 5852
 5853        # VCF header
 5854        vcf_reader = self.get_header()
 5855        log.debug("Initial header: " + str(vcf_reader.infos))
 5856
 5857        # Nb Variants POS
 5858        log.debug("NB Variants Start")
 5859        nb_variants = self.conn.execute(
 5860            f"SELECT count(*) AS count FROM variants"
 5861        ).fetchdf()["count"][0]
 5862        log.debug("NB Variants Stop")
 5863
 5864        # Existing annotations
 5865        for vcf_annotation in self.get_header().infos:
 5866
 5867            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5868            log.debug(
 5869                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5870            )
 5871
 5872        # Added columns
 5873        added_columns = []
 5874
 5875        # drop indexes
 5876        log.debug(f"Drop indexes...")
 5877        self.drop_indexes()
 5878
 5879        if annotations:
 5880
 5881            if "ALL" in annotations:
 5882
 5883                all_param = annotations.get("ALL", {})
 5884                all_param_formats = all_param.get("formats", None)
 5885                all_param_releases = all_param.get("releases", None)
 5886
 5887                databases_infos_dict = self.scan_databases(
 5888                    database_formats=all_param_formats,
 5889                    database_releases=all_param_releases,
 5890                )
 5891                for database_infos in databases_infos_dict.keys():
 5892                    if database_infos not in annotations:
 5893                        annotations[database_infos] = {"INFO": None}
 5894
 5895            for annotation in annotations:
 5896
 5897                if annotation in ["ALL"]:
 5898                    continue
 5899
 5900                # Annotation Name
 5901                annotation_name = os.path.basename(annotation)
 5902
 5903                # Annotation fields
 5904                annotation_fields = annotations[annotation]
 5905                if not annotation_fields:
 5906                    annotation_fields = {"INFO": None}
 5907
 5908                log.debug(f"Annotation '{annotation_name}'")
 5909                log.debug(
 5910                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5911                )
 5912
 5913                # Create Database
 5914                database = Database(
 5915                    database=annotation,
 5916                    databases_folders=databases_folders,
 5917                    assembly=assembly,
 5918                )
 5919
 5920                # Find files
 5921                parquet_file = database.get_database()
 5922                parquet_hdr_file = database.get_header_file()
 5923                parquet_type = database.get_type()
 5924
 5925                # Check if files exists
 5926                if not parquet_file or not parquet_hdr_file:
 5927                    msg_err_list = []
 5928                    if not parquet_file:
 5929                        msg_err_list.append(
 5930                            f"Annotation failed: Annotation file not found"
 5931                        )
 5932                    if parquet_file and not parquet_hdr_file:
 5933                        msg_err_list.append(
 5934                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 5935                        )
 5936
 5937                    log.error(". ".join(msg_err_list))
 5938                    raise ValueError(". ".join(msg_err_list))
 5939                else:
 5940                    # Get parquet connexion
 5941                    parquet_sql_attach = database.get_sql_database_attach(
 5942                        output="query"
 5943                    )
 5944                    if parquet_sql_attach:
 5945                        self.conn.execute(parquet_sql_attach)
 5946                    parquet_file_link = database.get_sql_database_link()
 5947                    # Log
 5948                    log.debug(
 5949                        f"Annotation '{annotation_name}' - file: "
 5950                        + str(parquet_file)
 5951                        + " and "
 5952                        + str(parquet_hdr_file)
 5953                    )
 5954
 5955                    # Database full header columns
 5956                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5957                        parquet_hdr_file
 5958                    )
 5959                    # Log
 5960                    log.debug(
 5961                        "Annotation database header columns : "
 5962                        + str(parquet_hdr_vcf_header_columns)
 5963                    )
 5964
 5965                    # Load header as VCF object
 5966                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5967                    # Log
 5968                    log.debug(
 5969                        "Annotation database header: "
 5970                        + str(parquet_hdr_vcf_header_infos)
 5971                    )
 5972
 5973                    # Get extra infos
 5974                    parquet_columns = database.get_extra_columns()
 5975                    # Log
 5976                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5977
 5978                    # Add extra columns if "ALL" in annotation_fields
 5979                    # if "ALL" in annotation_fields:
 5980                    #     allow_add_extra_column = True
 5981                    if "ALL" in annotation_fields and database.get_extra_columns():
 5982                        for extra_column in database.get_extra_columns():
 5983                            if (
 5984                                extra_column not in annotation_fields
 5985                                and extra_column.replace("INFO/", "")
 5986                                not in parquet_hdr_vcf_header_infos
 5987                            ):
 5988                                parquet_hdr_vcf_header_infos[extra_column] = (
 5989                                    vcf.parser._Info(
 5990                                        extra_column,
 5991                                        ".",
 5992                                        "String",
 5993                                        f"{extra_column} description",
 5994                                        "unknown",
 5995                                        "unknown",
 5996                                        self.code_type_map["String"],
 5997                                    )
 5998                                )
 5999
 6000                    # For all fields in database
 6001                    annotation_fields_all = False
 6002                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6003                        annotation_fields_all = True
 6004                        annotation_fields = {
 6005                            key: key for key in parquet_hdr_vcf_header_infos
 6006                        }
 6007
 6008                        log.debug(
 6009                            "Annotation database header - All annotations added: "
 6010                            + str(annotation_fields)
 6011                        )
 6012
 6013                    # Init
 6014
 6015                    # List of annotation fields to use
 6016                    sql_query_annotation_update_info_sets = []
 6017
 6018                    # List of annotation to agregate
 6019                    sql_query_annotation_to_agregate = []
 6020
 6021                    # Number of fields
 6022                    nb_annotation_field = 0
 6023
 6024                    # Annotation fields processed
 6025                    annotation_fields_processed = []
 6026
 6027                    # Columns mapping
 6028                    map_columns = database.map_columns(
 6029                        columns=annotation_fields, prefixes=["INFO/"]
 6030                    )
 6031
 6032                    # Query dict for fields to remove (update option)
 6033                    query_dict_remove = {}
 6034
 6035                    # Fetch Anotation fields
 6036                    for annotation_field in annotation_fields:
 6037
 6038                        # annotation_field_column
 6039                        annotation_field_column = map_columns.get(
 6040                            annotation_field, "INFO"
 6041                        )
 6042
 6043                        # field new name, if parametered
 6044                        annotation_fields_new_name = annotation_fields.get(
 6045                            annotation_field, annotation_field
 6046                        )
 6047                        if not annotation_fields_new_name:
 6048                            annotation_fields_new_name = annotation_field
 6049
 6050                        # To annotate
 6051                        # force_update_annotation = True
 6052                        # force_append_annotation = True
 6053                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6054                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6055                            force_update_annotation
 6056                            or force_append_annotation
 6057                            or (
 6058                                annotation_fields_new_name
 6059                                not in self.get_header().infos
 6060                            )
 6061                        ):
 6062
 6063                            # Add field to annotation to process list
 6064                            annotation_fields_processed.append(
 6065                                annotation_fields_new_name
 6066                            )
 6067
 6068                            # explode infos for the field
 6069                            annotation_fields_new_name_info_msg = ""
 6070                            if (
 6071                                force_update_annotation
 6072                                and annotation_fields_new_name
 6073                                in self.get_header().infos
 6074                            ):
 6075                                # Remove field from INFO
 6076                                query = f"""
 6077                                    UPDATE {table_variants} as table_variants
 6078                                    SET INFO = REGEXP_REPLACE(
 6079                                                concat(table_variants.INFO,''),
 6080                                                ';*{annotation_fields_new_name}=[^;]*',
 6081                                                ''
 6082                                                )
 6083                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6084                                """
 6085                                annotation_fields_new_name_info_msg = " [update]"
 6086                                query_dict_remove[
 6087                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6088                                ] = query
 6089
 6090                            # Sep between fields in INFO
 6091                            nb_annotation_field += 1
 6092                            if nb_annotation_field > 1:
 6093                                annotation_field_sep = ";"
 6094                            else:
 6095                                annotation_field_sep = ""
 6096
 6097                            log.info(
 6098                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6099                            )
 6100
 6101                            # Add INFO field to header
 6102                            parquet_hdr_vcf_header_infos_number = (
 6103                                parquet_hdr_vcf_header_infos[annotation_field].num
 6104                                or "."
 6105                            )
 6106                            parquet_hdr_vcf_header_infos_type = (
 6107                                parquet_hdr_vcf_header_infos[annotation_field].type
 6108                                or "String"
 6109                            )
 6110                            parquet_hdr_vcf_header_infos_description = (
 6111                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6112                                or f"{annotation_field} description"
 6113                            )
 6114                            parquet_hdr_vcf_header_infos_source = (
 6115                                parquet_hdr_vcf_header_infos[annotation_field].source
 6116                                or "unknown"
 6117                            )
 6118                            parquet_hdr_vcf_header_infos_version = (
 6119                                parquet_hdr_vcf_header_infos[annotation_field].version
 6120                                or "unknown"
 6121                            )
 6122
 6123                            vcf_reader.infos[annotation_fields_new_name] = (
 6124                                vcf.parser._Info(
 6125                                    annotation_fields_new_name,
 6126                                    parquet_hdr_vcf_header_infos_number,
 6127                                    parquet_hdr_vcf_header_infos_type,
 6128                                    parquet_hdr_vcf_header_infos_description,
 6129                                    parquet_hdr_vcf_header_infos_source,
 6130                                    parquet_hdr_vcf_header_infos_version,
 6131                                    self.code_type_map[
 6132                                        parquet_hdr_vcf_header_infos_type
 6133                                    ],
 6134                                )
 6135                            )
 6136
 6137                            # Append
 6138                            if force_append_annotation:
 6139                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6140                            else:
 6141                                query_case_when_append = ""
 6142
 6143                            # Annotation/Update query fields
 6144                            # Found in INFO column
 6145                            if (
 6146                                annotation_field_column == "INFO"
 6147                                and "INFO" in parquet_hdr_vcf_header_columns
 6148                            ):
 6149                                sql_query_annotation_update_info_sets.append(
 6150                                    f"""
 6151                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6152                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6153                                        ELSE ''
 6154                                    END
 6155                                """
 6156                                )
 6157                            # Found in a specific column
 6158                            else:
 6159                                sql_query_annotation_update_info_sets.append(
 6160                                    f"""
 6161                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6162                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6163                                        ELSE ''
 6164                                    END
 6165                                """
 6166                                )
 6167                                sql_query_annotation_to_agregate.append(
 6168                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6169                                )
 6170
 6171                        # Not to annotate
 6172                        else:
 6173
 6174                            if force_update_annotation:
 6175                                annotation_message = "forced"
 6176                            else:
 6177                                annotation_message = "skipped"
 6178
 6179                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6180                                log.warning(
 6181                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6182                                )
 6183                            if annotation_fields_new_name in self.get_header().infos:
 6184                                log.warning(
 6185                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6186                                )
 6187
 6188                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6189                    # allow_annotation_full_info = True
 6190                    allow_annotation_full_info = not force_append_annotation
 6191
 6192                    if parquet_type in ["regions"]:
 6193                        allow_annotation_full_info = False
 6194
 6195                    if (
 6196                        allow_annotation_full_info
 6197                        and nb_annotation_field == len(annotation_fields)
 6198                        and annotation_fields_all
 6199                        and (
 6200                            "INFO" in parquet_hdr_vcf_header_columns
 6201                            and "INFO" in database.get_extra_columns()
 6202                        )
 6203                    ):
 6204                        log.debug("Column INFO annotation enabled")
 6205                        sql_query_annotation_update_info_sets = []
 6206                        sql_query_annotation_update_info_sets.append(
 6207                            f" table_parquet.INFO "
 6208                        )
 6209
 6210                    if sql_query_annotation_update_info_sets:
 6211
 6212                        # Annotate
 6213                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6214
 6215                        # Join query annotation update info sets for SQL
 6216                        sql_query_annotation_update_info_sets_sql = ",".join(
 6217                            sql_query_annotation_update_info_sets
 6218                        )
 6219
 6220                        # Check chromosomes list (and variants infos)
 6221                        sql_query_chromosomes = f"""
 6222                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6223                            FROM {table_variants} as table_variants
 6224                            GROUP BY table_variants."#CHROM"
 6225                            ORDER BY table_variants."#CHROM"
 6226                            """
 6227                        sql_query_chromosomes_df = self.conn.execute(
 6228                            sql_query_chromosomes
 6229                        ).df()
 6230                        sql_query_chromosomes_dict = {
 6231                            entry["CHROM"]: {
 6232                                "count": entry["count_variants"],
 6233                                "min": entry["min_variants"],
 6234                                "max": entry["max_variants"],
 6235                            }
 6236                            for index, entry in sql_query_chromosomes_df.iterrows()
 6237                        }
 6238
 6239                        # Init
 6240                        nb_of_query = 0
 6241                        nb_of_variant_annotated = 0
 6242                        query_dict = query_dict_remove
 6243
 6244                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6245                        for chrom in sql_query_chromosomes_dict:
 6246
 6247                            # Number of variant by chromosome
 6248                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6249                                chrom, {}
 6250                            ).get("count", 0)
 6251
 6252                            log.debug(
 6253                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6254                            )
 6255
 6256                            # Annotation with regions database
 6257                            if parquet_type in ["regions"]:
 6258                                sql_query_annotation_from_clause = f"""
 6259                                    FROM (
 6260                                        SELECT 
 6261                                            '{chrom}' AS \"#CHROM\",
 6262                                            table_variants_from.\"POS\" AS \"POS\",
 6263                                            {",".join(sql_query_annotation_to_agregate)}
 6264                                        FROM {table_variants} as table_variants_from
 6265                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6266                                            table_parquet_from."#CHROM" = '{chrom}'
 6267                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6268                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 6269                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6270                                                )
 6271                                        )
 6272                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6273                                        GROUP BY table_variants_from.\"POS\"
 6274                                        )
 6275                                        as table_parquet
 6276                                """
 6277
 6278                                sql_query_annotation_where_clause = """
 6279                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6280                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6281                                """
 6282
 6283                            # Annotation with variants database
 6284                            else:
 6285                                sql_query_annotation_from_clause = f"""
 6286                                    FROM {parquet_file_link} as table_parquet
 6287                                """
 6288                                sql_query_annotation_where_clause = f"""
 6289                                    table_variants."#CHROM" = '{chrom}'
 6290                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6291                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6292                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6293                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6294                                """
 6295
 6296                            # Create update query
 6297                            sql_query_annotation_chrom_interval_pos = f"""
 6298                                UPDATE {table_variants} as table_variants
 6299                                    SET INFO = 
 6300                                        concat(
 6301                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6302                                                THEN table_variants.INFO
 6303                                                ELSE ''
 6304                                            END
 6305                                            ,
 6306                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6307                                                        AND (
 6308                                                        concat({sql_query_annotation_update_info_sets_sql})
 6309                                                        )
 6310                                                        NOT IN ('','.') 
 6311                                                    THEN ';'
 6312                                                    ELSE ''
 6313                                            END
 6314                                            ,
 6315                                            {sql_query_annotation_update_info_sets_sql}
 6316                                            )
 6317                                    {sql_query_annotation_from_clause}
 6318                                    WHERE {sql_query_annotation_where_clause}
 6319                                    ;
 6320                                """
 6321
 6322                            # Add update query to dict
 6323                            query_dict[
 6324                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6325                            ] = sql_query_annotation_chrom_interval_pos
 6326
 6327                        nb_of_query = len(query_dict)
 6328                        num_query = 0
 6329
 6330                        # SET max_expression_depth TO x
 6331                        self.conn.execute("SET max_expression_depth TO 10000")
 6332
 6333                        for query_name in query_dict:
 6334                            query = query_dict[query_name]
 6335                            num_query += 1
 6336                            log.info(
 6337                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6338                            )
 6339                            result = self.conn.execute(query)
 6340                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6341                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6342                            log.info(
 6343                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6344                            )
 6345
 6346                        log.info(
 6347                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6348                        )
 6349
 6350                    else:
 6351
 6352                        log.info(
 6353                            f"Annotation '{annotation_name}' - No Annotations available"
 6354                        )
 6355
 6356                    log.debug("Final header: " + str(vcf_reader.infos))
 6357
 6358        # Remove added columns
 6359        for added_column in added_columns:
 6360            self.drop_column(column=added_column)
 6361
 6362    def annotation_splice(self, threads: int = None) -> None:
 6363        """
 6364        This function annotate with snpEff
 6365
 6366        :param threads: The number of threads to use
 6367        :return: the value of the variable "return_value".
 6368        """
 6369
 6370        # DEBUG
 6371        log.debug("Start annotation with splice tools")
 6372
 6373        # Threads
 6374        if not threads:
 6375            threads = self.get_threads()
 6376        log.debug("Threads: " + str(threads))
 6377
 6378        # DEBUG
 6379        delete_tmp = True
 6380        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6381            delete_tmp = False
 6382            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6383
 6384        # Config
 6385        config = self.get_config()
 6386        log.debug("Config: " + str(config))
 6387        splice_config = config.get("tools", {}).get("splice", {})
 6388        if not splice_config:
 6389            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6390            msg_err = "No Splice tool config"
 6391            raise ValueError(msg_err)
 6392        log.debug(f"splice_config: {splice_config}")
 6393
 6394        # Config - Folders - Databases
 6395        databases_folders = (
 6396            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6397        )
 6398        log.debug("Databases annotations: " + str(databases_folders))
 6399
 6400        # Splice docker image
 6401        splice_docker_image = splice_config.get("docker").get("image")
 6402
 6403        # Pull splice image if it's not already there
 6404        if not check_docker_image_exists(splice_docker_image):
 6405            log.warning(
 6406                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6407            )
 6408            try:
 6409                command(f"docker pull {splice_config.get('docker').get('image')}")
 6410            except subprocess.CalledProcessError:
 6411                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6412                log.error(msg_err)
 6413                raise ValueError(msg_err)
 6414
 6415        # Config - splice databases
 6416        splice_databases = (
 6417            config.get("folders", {})
 6418            .get("databases", {})
 6419            .get("splice", DEFAULT_SPLICE_FOLDER)
 6420        )
 6421        splice_databases = full_path(splice_databases)
 6422
 6423        # Param
 6424        param = self.get_param()
 6425        log.debug("Param: " + str(param))
 6426
 6427        # Param
 6428        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6429        log.debug("Options: " + str(options))
 6430
 6431        # Data
 6432        table_variants = self.get_table_variants()
 6433
 6434        # Check if not empty
 6435        log.debug("Check if not empty")
 6436        sql_query_chromosomes = (
 6437            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6438        )
 6439        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6440            log.info("VCF empty")
 6441            return None
 6442
 6443        # Export in VCF
 6444        log.debug("Create initial file to annotate")
 6445
 6446        # Create output folder / work folder
 6447        if options.get("output_folder", ""):
 6448            output_folder = options.get("output_folder", "")
 6449            if not os.path.exists(output_folder):
 6450                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6451        else:
 6452            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6453            if not os.path.exists(output_folder):
 6454                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6455
 6456        if options.get("workdir", ""):
 6457            workdir = options.get("workdir", "")
 6458        else:
 6459            workdir = "/work"
 6460
 6461        # Create tmp VCF file
 6462        tmp_vcf = NamedTemporaryFile(
 6463            prefix=self.get_prefix(),
 6464            dir=output_folder,
 6465            suffix=".vcf",
 6466            delete=False,
 6467        )
 6468        tmp_vcf_name = tmp_vcf.name
 6469
 6470        # VCF header
 6471        header = self.get_header()
 6472
 6473        # Existing annotations
 6474        for vcf_annotation in self.get_header().infos:
 6475
 6476            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6477            log.debug(
 6478                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6479            )
 6480
 6481        # Memory limit
 6482        if config.get("memory", None):
 6483            memory_limit = config.get("memory", "8G").upper()
 6484            # upper()
 6485        else:
 6486            memory_limit = "8G"
 6487        log.debug(f"memory_limit: {memory_limit}")
 6488
 6489        # Check number of variants to annotate
 6490        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6491        where_clause_regex_spip = r"SPiP_\w+"
 6492        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6493        df_list_of_variants_to_annotate = self.get_query_to_df(
 6494            query=f""" SELECT * FROM variants {where_clause} """
 6495        )
 6496        if len(df_list_of_variants_to_annotate) == 0:
 6497            log.warning(
 6498                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6499            )
 6500            return None
 6501        else:
 6502            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6503
 6504        # Export VCF file
 6505        self.export_variant_vcf(
 6506            vcf_file=tmp_vcf_name,
 6507            remove_info=True,
 6508            add_samples=True,
 6509            index=False,
 6510            where_clause=where_clause,
 6511        )
 6512        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6513        if any(value for value in splice_config.values() if value is None):
 6514            log.warning("At least one splice config parameter is empty")
 6515            # exit annotation_splice
 6516            return None
 6517
 6518        # Params in splice nf
 6519        def check_values(dico: dict):
 6520            """
 6521            Ensure parameters for NF splice pipeline
 6522            """
 6523            for key, val in dico.items():
 6524                if key == "genome":
 6525                    if any(
 6526                        assemb in options.get("genome", {})
 6527                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6528                    ):
 6529                        yield f"--{key} hg19"
 6530                    elif any(
 6531                        assemb in options.get("genome", {})
 6532                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6533                    ):
 6534                        yield f"--{key} hg38"
 6535                elif (
 6536                    (isinstance(val, str) and val)
 6537                    or isinstance(val, int)
 6538                    or isinstance(val, bool)
 6539                ):
 6540                    yield f"--{key} {val}"
 6541
 6542        # Genome
 6543        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6544        options["genome"] = genome
 6545        # NF params
 6546        nf_params = []
 6547        # Add options
 6548        if options:
 6549            log.debug(options)
 6550            nf_params = list(check_values(options))
 6551            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6552        else:
 6553            log.debug("No NF params provided")
 6554        # Add threads
 6555        if "threads" not in options.keys():
 6556            nf_params.append(f"--threads {threads}")
 6557        # Genome path
 6558        genome_path = find_genome(
 6559            config.get("folders", {})
 6560            .get("databases", {})
 6561            .get("genomes", DEFAULT_GENOME_FOLDER),
 6562            file=f"{genome}.fa",
 6563        )
 6564        # Add genome path
 6565        if not genome_path:
 6566            raise ValueError(
 6567                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6568            )
 6569        else:
 6570            log.debug(f"Genome: {genome_path}")
 6571            nf_params.append(f"--genome_path {genome_path}")
 6572
 6573        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6574            """
 6575            Setting up updated databases for SPiP and SpliceAI
 6576            """
 6577
 6578            try:
 6579
 6580                # SpliceAI assembly transcriptome
 6581                spliceai_assembly = os.path.join(
 6582                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6583                    options.get("genome"),
 6584                    "transcriptome",
 6585                )
 6586                spip_assembly = options.get("genome")
 6587
 6588                spip = find(
 6589                    f"transcriptome_{spip_assembly}.RData",
 6590                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6591                )
 6592                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6593                log.debug(f"SPiP annotations: {spip}")
 6594                log.debug(f"SpliceAI annotations: {spliceai}")
 6595                if spip and spliceai:
 6596                    return [
 6597                        f"--spip_transcriptome {spip}",
 6598                        f"--spliceai_transcriptome {spliceai}",
 6599                    ]
 6600                else:
 6601                    log.warning(
 6602                        "Can't find splice databases in configuration, use annotations file from image"
 6603                    )
 6604            except TypeError:
 6605                log.warning(
 6606                    "Can't find splice databases in configuration, use annotations file from image"
 6607                )
 6608                return []
 6609
 6610        # Add options, check if transcriptome option have already beend provided
 6611        if (
 6612            "spip_transcriptome" not in nf_params
 6613            and "spliceai_transcriptome" not in nf_params
 6614        ):
 6615            splice_reference = splice_annotations(options, config)
 6616            if splice_reference:
 6617                nf_params.extend(splice_reference)
 6618        # nf_params.append(f"--output_folder {output_folder}")
 6619        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6620        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6621        log.debug(cmd)
 6622        splice_config["docker"]["command"] = cmd
 6623
 6624        # Ensure proxy is set
 6625        proxy = [
 6626            f"-e {var}={os.getenv(var)}"
 6627            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6628            if os.getenv(var) is not None
 6629        ]
 6630        docker_cmd = get_bin_command(
 6631            tool="splice",
 6632            bin_type="docker",
 6633            config=config,
 6634            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6635            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6636        )
 6637        # print(docker_cmd)
 6638        # exit()
 6639        # Docker debug
 6640        # if splice_config.get("rm_container"):
 6641        #     rm_container = "--rm"
 6642        # else:
 6643        #     rm_container = ""
 6644        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6645        log.debug(docker_cmd)
 6646        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6647        log.debug(res.stdout)
 6648        if res.stderr:
 6649            log.error(res.stderr)
 6650        res.check_returncode()
 6651        # Update variants
 6652        log.info("Annotation - Updating...")
 6653        # Test find output vcf
 6654        log.debug(
 6655            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6656        )
 6657        output_vcf = []
 6658        # Wrong folder to look in
 6659        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6660            if (
 6661                files
 6662                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6663            ):
 6664                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6665        # log.debug(os.listdir(options.get("output_folder")))
 6666        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6667        if not output_vcf:
 6668            log.debug(
 6669                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6670            )
 6671        else:
 6672            # Get new header from annotated vcf
 6673            log.debug(f"Initial header: {len(header.infos)} fields")
 6674            # Create new header with splice infos
 6675            new_vcf = Variants(input=output_vcf[0])
 6676            new_vcf_header = new_vcf.get_header().infos
 6677            for keys, infos in new_vcf_header.items():
 6678                if keys not in header.infos.keys():
 6679                    header.infos[keys] = infos
 6680            log.debug(f"New header: {len(header.infos)} fields")
 6681            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6682            self.update_from_vcf(output_vcf[0])
 6683
 6684        # Remove file
 6685        remove_if_exists(output_vcf)
 6686
 6687    ###
 6688    # Prioritization
 6689    ###
 6690
 6691    def get_config_default(self, name: str) -> dict:
 6692        """
 6693        The function `get_config_default` returns a dictionary containing default configurations for
 6694        various calculations and prioritizations.
 6695
 6696        :param name: The `get_config_default` function returns a dictionary containing default
 6697        configurations for different calculations and prioritizations. The `name` parameter is used to
 6698        specify which specific configuration to retrieve from the dictionary
 6699        :type name: str
 6700        :return: The function `get_config_default` returns a dictionary containing default configuration
 6701        settings for different calculations and prioritizations. The specific configuration settings are
 6702        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6703        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6704        returned. If there is no match, an empty dictionary is returned.
 6705        """
 6706
 6707        config_default = {
 6708            "calculations": {
 6709                "variant_chr_pos_alt_ref": {
 6710                    "type": "sql",
 6711                    "name": "variant_chr_pos_alt_ref",
 6712                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6713                    "available": False,
 6714                    "output_column_name": "variant_chr_pos_alt_ref",
 6715                    "output_column_type": "String",
 6716                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6717                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6718                    "operation_info": True,
 6719                },
 6720                "VARTYPE": {
 6721                    "type": "sql",
 6722                    "name": "VARTYPE",
 6723                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6724                    "available": True,
 6725                    "output_column_name": "VARTYPE",
 6726                    "output_column_type": "String",
 6727                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6728                    "operation_query": """
 6729                            CASE
 6730                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6731                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6732                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6733                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6734                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6735                                ELSE 'UNDEFINED'
 6736                            END
 6737                            """,
 6738                    "info_fields": ["SVTYPE"],
 6739                    "operation_info": True,
 6740                },
 6741                "snpeff_hgvs": {
 6742                    "type": "python",
 6743                    "name": "snpeff_hgvs",
 6744                    "description": "HGVS nomenclatures from snpEff annotation",
 6745                    "available": True,
 6746                    "function_name": "calculation_extract_snpeff_hgvs",
 6747                    "function_params": ["snpeff_hgvs", "ANN"],
 6748                },
 6749                "snpeff_ann_explode": {
 6750                    "type": "python",
 6751                    "name": "snpeff_ann_explode",
 6752                    "description": "Explode snpEff annotations with uniquify values",
 6753                    "available": True,
 6754                    "function_name": "calculation_snpeff_ann_explode",
 6755                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6756                },
 6757                "snpeff_ann_explode_uniquify": {
 6758                    "type": "python",
 6759                    "name": "snpeff_ann_explode_uniquify",
 6760                    "description": "Explode snpEff annotations",
 6761                    "available": True,
 6762                    "function_name": "calculation_snpeff_ann_explode",
 6763                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6764                },
 6765                "snpeff_ann_explode_json": {
 6766                    "type": "python",
 6767                    "name": "snpeff_ann_explode_json",
 6768                    "description": "Explode snpEff annotations in JSON format",
 6769                    "available": True,
 6770                    "function_name": "calculation_snpeff_ann_explode",
 6771                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6772                },
 6773                "NOMEN": {
 6774                    "type": "python",
 6775                    "name": "NOMEN",
 6776                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6777                    "available": True,
 6778                    "function_name": "calculation_extract_nomen",
 6779                    "function_params": [],
 6780                },
 6781                "FINDBYPIPELINE": {
 6782                    "type": "python",
 6783                    "name": "FINDBYPIPELINE",
 6784                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6785                    "available": True,
 6786                    "function_name": "calculation_find_by_pipeline",
 6787                    "function_params": ["findbypipeline"],
 6788                },
 6789                "FINDBYSAMPLE": {
 6790                    "type": "python",
 6791                    "name": "FINDBYSAMPLE",
 6792                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6793                    "available": True,
 6794                    "function_name": "calculation_find_by_pipeline",
 6795                    "function_params": ["findbysample"],
 6796                },
 6797                "GENOTYPECONCORDANCE": {
 6798                    "type": "python",
 6799                    "name": "GENOTYPECONCORDANCE",
 6800                    "description": "Concordance of genotype for multi caller VCF",
 6801                    "available": True,
 6802                    "function_name": "calculation_genotype_concordance",
 6803                    "function_params": [],
 6804                },
 6805                "BARCODE": {
 6806                    "type": "python",
 6807                    "name": "BARCODE",
 6808                    "description": "BARCODE as VaRank tool",
 6809                    "available": True,
 6810                    "function_name": "calculation_barcode",
 6811                    "function_params": [],
 6812                },
 6813                "BARCODEFAMILY": {
 6814                    "type": "python",
 6815                    "name": "BARCODEFAMILY",
 6816                    "description": "BARCODEFAMILY as VaRank tool",
 6817                    "available": True,
 6818                    "function_name": "calculation_barcode_family",
 6819                    "function_params": ["BCF"],
 6820                },
 6821                "TRIO": {
 6822                    "type": "python",
 6823                    "name": "TRIO",
 6824                    "description": "Inheritance for a trio family",
 6825                    "available": True,
 6826                    "function_name": "calculation_trio",
 6827                    "function_params": [],
 6828                },
 6829                "VAF": {
 6830                    "type": "python",
 6831                    "name": "VAF",
 6832                    "description": "Variant Allele Frequency (VAF) harmonization",
 6833                    "available": True,
 6834                    "function_name": "calculation_vaf_normalization",
 6835                    "function_params": [],
 6836                },
 6837                "VAF_stats": {
 6838                    "type": "python",
 6839                    "name": "VAF_stats",
 6840                    "description": "Variant Allele Frequency (VAF) statistics",
 6841                    "available": True,
 6842                    "function_name": "calculation_genotype_stats",
 6843                    "function_params": ["VAF"],
 6844                },
 6845                "DP_stats": {
 6846                    "type": "python",
 6847                    "name": "DP_stats",
 6848                    "description": "Depth (DP) statistics",
 6849                    "available": True,
 6850                    "function_name": "calculation_genotype_stats",
 6851                    "function_params": ["DP"],
 6852                },
 6853                "variant_id": {
 6854                    "type": "python",
 6855                    "name": "variant_id",
 6856                    "description": "Variant ID generated from variant position and type",
 6857                    "available": True,
 6858                    "function_name": "calculation_variant_id",
 6859                    "function_params": [],
 6860                },
 6861                "transcripts_json": {
 6862                    "type": "python",
 6863                    "name": "transcripts_json",
 6864                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6865                    "available": True,
 6866                    "function_name": "calculation_transcripts_annotation",
 6867                    "function_params": ["transcripts_json", None],
 6868                },
 6869                "transcripts_ann": {
 6870                    "type": "python",
 6871                    "name": "transcripts_ann",
 6872                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6873                    "available": True,
 6874                    "function_name": "calculation_transcripts_annotation",
 6875                    "function_params": [None, "transcripts_ann"],
 6876                },
 6877                "transcripts_annotations": {
 6878                    "type": "python",
 6879                    "name": "transcripts_annotations",
 6880                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6881                    "available": True,
 6882                    "function_name": "calculation_transcripts_annotation",
 6883                    "function_params": [None, None],
 6884                },
 6885                "transcripts_prioritization": {
 6886                    "type": "python",
 6887                    "name": "transcripts_prioritization",
 6888                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6889                    "available": True,
 6890                    "function_name": "calculation_transcripts_prioritization",
 6891                    "function_params": [],
 6892                },
 6893                "transcripts_export": {
 6894                    "type": "python",
 6895                    "name": "transcripts_export",
 6896                    "description": "Export transcripts table/view as a file (using param.json)",
 6897                    "available": True,
 6898                    "function_name": "calculation_transcripts_export",
 6899                    "function_params": [],
 6900                },
 6901            },
 6902            "prioritizations": {
 6903                "default": {
 6904                    "ANN2": [
 6905                        {
 6906                            "type": "contains",
 6907                            "value": "HIGH",
 6908                            "score": 5,
 6909                            "flag": "PASS",
 6910                            "comment": [
 6911                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6912                            ],
 6913                        },
 6914                        {
 6915                            "type": "contains",
 6916                            "value": "MODERATE",
 6917                            "score": 3,
 6918                            "flag": "PASS",
 6919                            "comment": [
 6920                                "A non-disruptive variant that might change protein effectiveness"
 6921                            ],
 6922                        },
 6923                        {
 6924                            "type": "contains",
 6925                            "value": "LOW",
 6926                            "score": 0,
 6927                            "flag": "FILTERED",
 6928                            "comment": [
 6929                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6930                            ],
 6931                        },
 6932                        {
 6933                            "type": "contains",
 6934                            "value": "MODIFIER",
 6935                            "score": 0,
 6936                            "flag": "FILTERED",
 6937                            "comment": [
 6938                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6939                            ],
 6940                        },
 6941                    ],
 6942                }
 6943            },
 6944        }
 6945
 6946        return config_default.get(name, None)
 6947
 6948    def get_config_json(
 6949        self, name: str, config_dict: dict = {}, config_file: str = None
 6950    ) -> dict:
 6951        """
 6952        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6953        default values, a dictionary, and a file.
 6954
 6955        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6956        the name of the configuration. It is used to identify and retrieve the configuration settings
 6957        for a specific component or module
 6958        :type name: str
 6959        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6960        dictionary that allows you to provide additional configuration settings or overrides. When you
 6961        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6962        the key is the configuration setting you want to override or
 6963        :type config_dict: dict
 6964        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6965        specify the path to a configuration file that contains additional settings. If provided, the
 6966        function will read the contents of this file and update the configuration dictionary with the
 6967        values found in the file, overriding any existing values with the
 6968        :type config_file: str
 6969        :return: The function `get_config_json` returns a dictionary containing the configuration
 6970        settings.
 6971        """
 6972
 6973        # Create with default prioritizations
 6974        config_default = self.get_config_default(name=name)
 6975        configuration = config_default
 6976        # log.debug(f"configuration={configuration}")
 6977
 6978        # Replace prioritizations from dict
 6979        for config in config_dict:
 6980            configuration[config] = config_dict[config]
 6981
 6982        # Replace prioritizations from file
 6983        config_file = full_path(config_file)
 6984        if config_file:
 6985            if os.path.exists(config_file):
 6986                with open(config_file) as config_file_content:
 6987                    config_file_dict = json.load(config_file_content)
 6988                for config in config_file_dict:
 6989                    configuration[config] = config_file_dict[config]
 6990            else:
 6991                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6992                log.error(msg_error)
 6993                raise ValueError(msg_error)
 6994
 6995        return configuration
 6996
 6997    def prioritization(
 6998        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6999    ) -> bool:
 7000        """
 7001        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7002        prioritizes variants based on configured profiles and criteria.
 7003
 7004        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7005        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7006        a table name is provided, the method will prioritize the variants in that specific table
 7007        :type table: str
 7008        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7009        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7010        provided, the code will use a default prefix value of "PZ"
 7011        :type pz_prefix: str
 7012        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7013        additional parameters specific to the prioritization process. These parameters can include
 7014        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7015        configurations needed for the prioritization of variants in a V
 7016        :type pz_param: dict
 7017        :return: A boolean value (True) is being returned from the `prioritization` function.
 7018        """
 7019
 7020        # Config
 7021        config = self.get_config()
 7022
 7023        # Param
 7024        param = self.get_param()
 7025
 7026        # Prioritization param
 7027        if pz_param is not None:
 7028            prioritization_param = pz_param
 7029        else:
 7030            prioritization_param = param.get("prioritization", {})
 7031
 7032        # Configuration profiles
 7033        prioritization_config_file = prioritization_param.get(
 7034            "prioritization_config", None
 7035        )
 7036        prioritization_config_file = full_path(prioritization_config_file)
 7037        prioritizations_config = self.get_config_json(
 7038            name="prioritizations", config_file=prioritization_config_file
 7039        )
 7040
 7041        # Prioritization prefix
 7042        pz_prefix_default = "PZ"
 7043        if pz_prefix is None:
 7044            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7045
 7046        # Prioritization options
 7047        profiles = prioritization_param.get("profiles", [])
 7048        if isinstance(profiles, str):
 7049            profiles = profiles.split(",")
 7050        pzfields = prioritization_param.get(
 7051            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7052        )
 7053        if isinstance(pzfields, str):
 7054            pzfields = pzfields.split(",")
 7055        default_profile = prioritization_param.get("default_profile", None)
 7056        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7057        prioritization_score_mode = prioritization_param.get(
 7058            "prioritization_score_mode", "HOWARD"
 7059        )
 7060
 7061        # Quick Prioritizations
 7062        prioritizations = param.get("prioritizations", None)
 7063        if prioritizations:
 7064            log.info("Quick Prioritization:")
 7065            for profile in prioritizations.split(","):
 7066                if profile not in profiles:
 7067                    profiles.append(profile)
 7068                    log.info(f"   {profile}")
 7069
 7070        # If profile "ALL" provided, all profiles in the config profiles
 7071        if "ALL" in profiles:
 7072            profiles = list(prioritizations_config.keys())
 7073
 7074        for profile in profiles:
 7075            if prioritizations_config.get(profile, None):
 7076                log.debug(f"Profile '{profile}' configured")
 7077            else:
 7078                msg_error = f"Profile '{profile}' NOT configured"
 7079                log.error(msg_error)
 7080                raise ValueError(msg_error)
 7081
 7082        if profiles:
 7083            log.info(f"Prioritization... ")
 7084        else:
 7085            log.debug(f"No profile defined")
 7086            return False
 7087
 7088        if not default_profile and len(profiles):
 7089            default_profile = profiles[0]
 7090
 7091        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7092        log.debug("Profiles to check: " + str(list(profiles)))
 7093
 7094        # Variables
 7095        if table is not None:
 7096            table_variants = table
 7097        else:
 7098            table_variants = self.get_table_variants(clause="update")
 7099        log.debug(f"Table to prioritize: {table_variants}")
 7100
 7101        # Added columns
 7102        added_columns = []
 7103
 7104        # Create list of PZfields
 7105        # List of PZFields
 7106        list_of_pzfields_original = pzfields + [
 7107            pzfield + pzfields_sep + profile
 7108            for pzfield in pzfields
 7109            for profile in profiles
 7110        ]
 7111        list_of_pzfields = []
 7112        log.debug(f"{list_of_pzfields_original}")
 7113
 7114        # Remove existing PZfields to use if exists
 7115        for pzfield in list_of_pzfields_original:
 7116            if self.get_header().infos.get(pzfield, None) is None:
 7117                list_of_pzfields.append(pzfield)
 7118                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7119            else:
 7120                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7121
 7122        if list_of_pzfields:
 7123
 7124            # Explode Infos prefix
 7125            explode_infos_prefix = self.get_explode_infos_prefix()
 7126
 7127            # PZfields tags description
 7128            PZfields_INFOS = {
 7129                f"{pz_prefix}Tags": {
 7130                    "ID": f"{pz_prefix}Tags",
 7131                    "Number": ".",
 7132                    "Type": "String",
 7133                    "Description": "Variant tags based on annotation criteria",
 7134                },
 7135                f"{pz_prefix}Score": {
 7136                    "ID": f"{pz_prefix}Score",
 7137                    "Number": 1,
 7138                    "Type": "Integer",
 7139                    "Description": "Variant score based on annotation criteria",
 7140                },
 7141                f"{pz_prefix}Flag": {
 7142                    "ID": f"{pz_prefix}Flag",
 7143                    "Number": 1,
 7144                    "Type": "String",
 7145                    "Description": "Variant flag based on annotation criteria",
 7146                },
 7147                f"{pz_prefix}Comment": {
 7148                    "ID": f"{pz_prefix}Comment",
 7149                    "Number": ".",
 7150                    "Type": "String",
 7151                    "Description": "Variant comment based on annotation criteria",
 7152                },
 7153                f"{pz_prefix}Infos": {
 7154                    "ID": f"{pz_prefix}Infos",
 7155                    "Number": ".",
 7156                    "Type": "String",
 7157                    "Description": "Variant infos based on annotation criteria",
 7158                },
 7159                f"{pz_prefix}Class": {
 7160                    "ID": f"{pz_prefix}Class",
 7161                    "Number": ".",
 7162                    "Type": "String",
 7163                    "Description": "Variant class based on annotation criteria",
 7164                },
 7165            }
 7166
 7167            # Create INFO fields if not exist
 7168            for field in PZfields_INFOS:
 7169                field_ID = PZfields_INFOS[field]["ID"]
 7170                field_description = PZfields_INFOS[field]["Description"]
 7171                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7172                    field_description = (
 7173                        PZfields_INFOS[field]["Description"]
 7174                        + f", profile {default_profile}"
 7175                    )
 7176                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7177                        field_ID,
 7178                        PZfields_INFOS[field]["Number"],
 7179                        PZfields_INFOS[field]["Type"],
 7180                        field_description,
 7181                        "unknown",
 7182                        "unknown",
 7183                        code_type_map[PZfields_INFOS[field]["Type"]],
 7184                    )
 7185
 7186            # Create INFO fields if not exist for each profile
 7187            for profile in prioritizations_config:
 7188                if profile in profiles or profiles == []:
 7189                    for field in PZfields_INFOS:
 7190                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7191                        field_description = (
 7192                            PZfields_INFOS[field]["Description"]
 7193                            + f", profile {profile}"
 7194                        )
 7195                        if (
 7196                            field_ID not in self.get_header().infos
 7197                            and field in pzfields
 7198                        ):
 7199                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7200                                field_ID,
 7201                                PZfields_INFOS[field]["Number"],
 7202                                PZfields_INFOS[field]["Type"],
 7203                                field_description,
 7204                                "unknown",
 7205                                "unknown",
 7206                                code_type_map[PZfields_INFOS[field]["Type"]],
 7207                            )
 7208
 7209            # Header
 7210            for pzfield in list_of_pzfields:
 7211                if re.match(f"{pz_prefix}Score.*", pzfield):
 7212                    added_column = self.add_column(
 7213                        table_name=table_variants,
 7214                        column_name=pzfield,
 7215                        column_type="INTEGER",
 7216                        default_value="0",
 7217                    )
 7218                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7219                    added_column = self.add_column(
 7220                        table_name=table_variants,
 7221                        column_name=pzfield,
 7222                        column_type="BOOLEAN",
 7223                        default_value="1",
 7224                    )
 7225                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7226                    added_column = self.add_column(
 7227                        table_name=table_variants,
 7228                        column_name=pzfield,
 7229                        column_type="VARCHAR[]",
 7230                        default_value="null",
 7231                    )
 7232                else:
 7233                    added_column = self.add_column(
 7234                        table_name=table_variants,
 7235                        column_name=pzfield,
 7236                        column_type="STRING",
 7237                        default_value="''",
 7238                    )
 7239                added_columns.append(added_column)
 7240
 7241            # Profiles
 7242            if profiles:
 7243
 7244                # foreach profile in configuration file
 7245                for profile in prioritizations_config:
 7246
 7247                    # If profile is asked in param, or ALL are asked (empty profile [])
 7248                    if profile in profiles or profiles == []:
 7249                        log.info(f"Profile '{profile}'")
 7250
 7251                        sql_set_info_option = ""
 7252
 7253                        sql_set_info = []
 7254
 7255                        # PZ fields set
 7256
 7257                        # PZScore
 7258                        if (
 7259                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7260                            in list_of_pzfields
 7261                        ):
 7262                            sql_set_info.append(
 7263                                f"""
 7264                                    concat(
 7265                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7266                                        {pz_prefix}Score{pzfields_sep}{profile}
 7267                                    ) 
 7268                                """
 7269                            )
 7270                            if (
 7271                                profile == default_profile
 7272                                and f"{pz_prefix}Score" in list_of_pzfields
 7273                            ):
 7274                                sql_set_info.append(
 7275                                    f"""
 7276                                        concat(
 7277                                            '{pz_prefix}Score=',
 7278                                            {pz_prefix}Score{pzfields_sep}{profile}
 7279                                        )
 7280                                    """
 7281                                )
 7282
 7283                        # PZFlag
 7284                        if (
 7285                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7286                            in list_of_pzfields
 7287                        ):
 7288                            sql_set_info.append(
 7289                                f"""
 7290                                    concat(
 7291                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7292                                        CASE 
 7293                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7294                                            THEN 'PASS'
 7295                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7296                                            THEN 'FILTERED'
 7297                                        END
 7298                                    ) 
 7299                                """
 7300                            )
 7301                            if (
 7302                                profile == default_profile
 7303                                and f"{pz_prefix}Flag" in list_of_pzfields
 7304                            ):
 7305                                sql_set_info.append(
 7306                                    f"""
 7307                                        concat(
 7308                                            '{pz_prefix}Flag=',
 7309                                            CASE 
 7310                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7311                                                THEN 'PASS'
 7312                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7313                                                THEN 'FILTERED'
 7314                                            END
 7315                                        )
 7316                                    """
 7317                                )
 7318
 7319                        # PZClass
 7320                        if (
 7321                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7322                            in list_of_pzfields
 7323                        ):
 7324                            sql_set_info.append(
 7325                                f"""
 7326                                    concat(
 7327                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7328                                        CASE
 7329                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7330                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7331                                            ELSE '.'
 7332                                        END 
 7333                                    )
 7334                                    
 7335                                """
 7336                            )
 7337                            if (
 7338                                profile == default_profile
 7339                                and f"{pz_prefix}Class" in list_of_pzfields
 7340                            ):
 7341                                sql_set_info.append(
 7342                                    f"""
 7343                                        concat(
 7344                                            '{pz_prefix}Class=',
 7345                                            CASE
 7346                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7347                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7348                                                ELSE '.'
 7349                                            END 
 7350                                        )
 7351                                    """
 7352                                )
 7353
 7354                        # PZComment
 7355                        if (
 7356                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7357                            in list_of_pzfields
 7358                        ):
 7359                            sql_set_info.append(
 7360                                f"""
 7361                                    CASE
 7362                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7363                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7364                                        ELSE ''
 7365                                    END
 7366                                """
 7367                            )
 7368                            if (
 7369                                profile == default_profile
 7370                                and f"{pz_prefix}Comment" in list_of_pzfields
 7371                            ):
 7372                                sql_set_info.append(
 7373                                    f"""
 7374                                        CASE
 7375                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7376                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7377                                            ELSE ''
 7378                                        END
 7379                                    """
 7380                                )
 7381
 7382                        # PZInfos
 7383                        if (
 7384                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7385                            in list_of_pzfields
 7386                        ):
 7387                            sql_set_info.append(
 7388                                f"""
 7389                                    CASE
 7390                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7391                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7392                                        ELSE ''
 7393                                    END
 7394                                """
 7395                            )
 7396                            if (
 7397                                profile == default_profile
 7398                                and f"{pz_prefix}Infos" in list_of_pzfields
 7399                            ):
 7400                                sql_set_info.append(
 7401                                    f"""
 7402                                        CASE
 7403                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7404                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7405                                            ELSE ''
 7406                                        END
 7407                                    """
 7408                                )
 7409
 7410                        # Merge PZfields
 7411                        sql_set_info_option = ""
 7412                        sql_set_sep = ""
 7413                        for sql_set in sql_set_info:
 7414                            if sql_set_sep:
 7415                                sql_set_info_option += f"""
 7416                                    , concat('{sql_set_sep}', {sql_set})
 7417                                """
 7418                            else:
 7419                                sql_set_info_option += f"""
 7420                                    , {sql_set}
 7421                                """
 7422                            sql_set_sep = ";"
 7423
 7424                        sql_queries = []
 7425                        for annotation in prioritizations_config[profile]:
 7426
 7427                            # skip special sections
 7428                            if annotation.startswith("_"):
 7429                                continue
 7430
 7431                            # For each criterions
 7432                            for criterion in prioritizations_config[profile][
 7433                                annotation
 7434                            ]:
 7435
 7436                                # Criterion mode
 7437                                criterion_mode = None
 7438                                if np.any(
 7439                                    np.isin(list(criterion.keys()), ["type", "value"])
 7440                                ):
 7441                                    criterion_mode = "operation"
 7442                                elif np.any(
 7443                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7444                                ):
 7445                                    criterion_mode = "sql"
 7446                                log.debug(f"Criterion Mode: {criterion_mode}")
 7447
 7448                                # Criterion parameters
 7449                                criterion_type = criterion.get("type", None)
 7450                                criterion_value = criterion.get("value", None)
 7451                                criterion_sql = criterion.get("sql", None)
 7452                                criterion_fields = criterion.get("fields", None)
 7453                                criterion_score = criterion.get("score", 0)
 7454                                criterion_flag = criterion.get("flag", "PASS")
 7455                                criterion_class = criterion.get("class", None)
 7456                                criterion_flag_bool = criterion_flag == "PASS"
 7457                                criterion_comment = (
 7458                                    ", ".join(criterion.get("comment", []))
 7459                                    .replace("'", "''")
 7460                                    .replace(";", ",")
 7461                                    .replace("\t", " ")
 7462                                )
 7463                                criterion_infos = (
 7464                                    str(criterion)
 7465                                    .replace("'", "''")
 7466                                    .replace(";", ",")
 7467                                    .replace("\t", " ")
 7468                                )
 7469
 7470                                # SQL
 7471                                if criterion_sql is not None and isinstance(
 7472                                    criterion_sql, list
 7473                                ):
 7474                                    criterion_sql = " ".join(criterion_sql)
 7475
 7476                                # Fields and explode
 7477                                if criterion_fields is None:
 7478                                    criterion_fields = [annotation]
 7479                                if not isinstance(criterion_fields, list):
 7480                                    criterion_fields = str(criterion_fields).split(",")
 7481
 7482                                # Class
 7483                                if criterion_class is not None and not isinstance(
 7484                                    criterion_class, list
 7485                                ):
 7486                                    criterion_class = str(criterion_class).split(",")
 7487
 7488                                for annotation_field in criterion_fields:
 7489
 7490                                    # Explode specific annotation
 7491                                    log.debug(
 7492                                        f"Explode annotation '{annotation_field}'"
 7493                                    )
 7494                                    added_columns += self.explode_infos(
 7495                                        prefix=explode_infos_prefix,
 7496                                        fields=[annotation_field],
 7497                                        table=table_variants,
 7498                                    )
 7499                                    extra_infos = self.get_extra_infos(
 7500                                        table=table_variants
 7501                                    )
 7502
 7503                                    # Check if annotation field is present
 7504                                    if (
 7505                                        f"{explode_infos_prefix}{annotation_field}"
 7506                                        not in extra_infos
 7507                                    ):
 7508                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7509                                        log.error(msq_err)
 7510                                        raise ValueError(msq_err)
 7511                                    else:
 7512                                        log.debug(
 7513                                            f"Annotation '{annotation_field}' in data"
 7514                                        )
 7515
 7516                                sql_set = []
 7517                                sql_set_info = []
 7518
 7519                                # PZ fields set
 7520
 7521                                # PZScore
 7522                                if (
 7523                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7524                                    in list_of_pzfields
 7525                                ):
 7526                                    # if prioritization_score_mode == "HOWARD":
 7527                                    #     sql_set.append(
 7528                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7529                                    #     )
 7530                                    # VaRank prioritization score mode
 7531                                    if prioritization_score_mode == "VaRank":
 7532                                        sql_set.append(
 7533                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7534                                        )
 7535                                    # default HOWARD prioritization score mode
 7536                                    else:
 7537                                        sql_set.append(
 7538                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7539                                        )
 7540
 7541                                # PZFlag
 7542                                if (
 7543                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7544                                    in list_of_pzfields
 7545                                ):
 7546                                    sql_set.append(
 7547                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7548                                    )
 7549
 7550                                # PZClass
 7551                                if (
 7552                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7553                                    in list_of_pzfields
 7554                                    and criterion_class is not None
 7555                                ):
 7556                                    sql_set.append(
 7557                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7558                                    )
 7559
 7560                                # PZComment
 7561                                if (
 7562                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7563                                    in list_of_pzfields
 7564                                ):
 7565                                    sql_set.append(
 7566                                        f"""
 7567                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7568                                                concat(
 7569                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7570                                                    CASE 
 7571                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7572                                                        THEN ', '
 7573                                                        ELSE ''
 7574                                                    END,
 7575                                                    '{criterion_comment}'
 7576                                                )
 7577                                        """
 7578                                    )
 7579
 7580                                # PZInfos
 7581                                if (
 7582                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7583                                    in list_of_pzfields
 7584                                ):
 7585                                    sql_set.append(
 7586                                        f"""
 7587                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7588                                                concat(
 7589                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7590                                                    '{criterion_infos}'
 7591                                                )
 7592                                        """
 7593                                    )
 7594                                sql_set_option = ",".join(sql_set)
 7595
 7596                                # Criterion and comparison
 7597                                if sql_set_option:
 7598
 7599                                    if criterion_mode in ["operation"]:
 7600
 7601                                        try:
 7602                                            float(criterion_value)
 7603                                            sql_update = f"""
 7604                                                UPDATE {table_variants}
 7605                                                SET {sql_set_option}
 7606                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7607                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7608                                            """
 7609                                        except:
 7610                                            contains_option = ""
 7611                                            if criterion_type == "contains":
 7612                                                contains_option = ".*"
 7613                                            sql_update = f"""
 7614                                                UPDATE {table_variants}
 7615                                                SET {sql_set_option}
 7616                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7617                                            """
 7618                                        sql_queries.append(sql_update)
 7619
 7620                                    elif criterion_mode in ["sql"]:
 7621
 7622                                        sql_update = f"""
 7623                                            UPDATE {table_variants}
 7624                                            SET {sql_set_option}
 7625                                            WHERE {criterion_sql}
 7626                                        """
 7627                                        sql_queries.append(sql_update)
 7628
 7629                                    else:
 7630                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7631                                        log.error(msg_err)
 7632                                        raise ValueError(msg_err)
 7633
 7634                                else:
 7635                                    log.warning(
 7636                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7637                                    )
 7638
 7639                        # PZTags
 7640                        if (
 7641                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7642                            in list_of_pzfields
 7643                        ):
 7644
                            # Create PZTags value
 7646                            pztags_value = ""
 7647                            pztags_sep_default = ","
 7648                            pztags_sep = ""
 7649                            for pzfield in pzfields:
 7650                                if pzfield not in [f"{pz_prefix}Tags"]:
 7651                                    if (
 7652                                        f"{pzfield}{pzfields_sep}{profile}"
 7653                                        in list_of_pzfields
 7654                                    ):
 7655                                        if pzfield in [f"{pz_prefix}Flag"]:
 7656                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7657                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7658                                                    THEN 'PASS'
 7659                                                    ELSE 'FILTERED'
 7660                                                END, '"""
 7661                                        elif pzfield in [f"{pz_prefix}Class"]:
 7662                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7663                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7664                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7665                                                    ELSE '.'
 7666                                                END, '"""
 7667                                        else:
 7668                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7669                                        pztags_sep = pztags_sep_default
 7670
                            # Add Query update for PZTags
 7672                            sql_update_pztags = f"""
 7673                                UPDATE {table_variants}
 7674                                SET INFO = concat(
 7675                                        INFO,
 7676                                        CASE WHEN INFO NOT in ('','.')
 7677                                                THEN ';'
 7678                                                ELSE ''
 7679                                        END,
 7680                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7681                                    )
 7682                                """
 7683                            sql_queries.append(sql_update_pztags)
 7684
                            # Add Query update for PZTags for default profile
 7686                            if profile == default_profile:
 7687                                sql_update_pztags_default = f"""
 7688                                UPDATE {table_variants}
 7689                                SET INFO = concat(
 7690                                        INFO,
 7691                                        ';',
 7692                                        '{pz_prefix}Tags={pztags_value}'
 7693                                    )
 7694                                """
 7695                                sql_queries.append(sql_update_pztags_default)
 7696
 7697                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7698
 7699                        if sql_queries:
 7700
 7701                            for sql_query in sql_queries:
 7702                                log.debug(
 7703                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7704                                )
 7705                                self.conn.execute(sql_query)
 7706
 7707                        log.info(f"""Profile '{profile}' - Update... """)
 7708                        sql_query_update = f"""
 7709                            UPDATE {table_variants}
 7710                            SET INFO =  
 7711                                concat(
 7712                                    CASE
 7713                                        WHEN INFO NOT IN ('','.')
 7714                                        THEN concat(INFO, ';')
 7715                                        ELSE ''
 7716                                    END
 7717                                    {sql_set_info_option}
 7718                                )
 7719                        """
 7720                        self.conn.execute(sql_query_update)
 7721
 7722        else:
 7723
 7724            log.warning(f"No profiles in parameters")
 7725
 7726        # Remove added columns
 7727        for added_column in added_columns:
 7728            self.drop_column(column=added_column)
 7729
 7730        # Explode INFOS fields into table fields
 7731        if self.get_explode_infos():
 7732            self.explode_infos(
 7733                prefix=self.get_explode_infos_prefix(),
 7734                fields=self.get_explode_infos_fields(),
 7735                force=True,
 7736            )
 7737
 7738        return True
 7739
 7740    ###
 7741    # HGVS
 7742    ###
 7743
 7744    def annotation_hgvs(self, threads: int = None) -> None:
 7745        """
 7746        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7747        coordinates and alleles.
 7748
 7749        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7750        threads to use for parallel processing. If no value is provided, it will default to the number
 7751        of threads obtained from the `get_threads()` method
 7752        :type threads: int
 7753        """
 7754
 7755        # Function for each partition of the Dask Dataframe
 7756        def partition_function(partition):
 7757            """
 7758            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7759            each row of a DataFrame called `partition`.
 7760
 7761            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7762            to be processed
 7763            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7764            the "partition" dataframe along the axis 1.
 7765            """
 7766            return partition.apply(annotation_hgvs_partition, axis=1)
 7767
        def annotation_hgvs_partition(row) -> str:
            """
            Build the comma-separated HGVS annotation string for one variant row.

            Uses closure variables from `annotation_hgvs`: `polars_conn` (polars
            SQL context with `refseq_df` — and, when available, `refseqlink_df` —
            registered), `transcripts`, `genome`, and the HGVS option flags
            (`use_exon`, `use_gene`, `use_protein`, `add_protein`, `full_format`,
            `use_version`, `codon_type`).

            :param row: dict-like object providing the keys "CHROM", "POS",
                "REF" and "ALT"
            :return: HGVS names joined with ","; empty string when no transcript
                overlaps the position
            """

            # Variant coordinates and alleles ("chr" intentionally mirrors the
            # column name; it shadows the builtin only inside this closure)
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts: refseq_df was pre-joined so
            # that each (CHROM, POS) maps to the transcripts covering POS
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list (one entry per transcript, plus an
            # extra protein-level entry per transcript when add_protein is set)
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model looked up by name
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number only computed when requested
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession for this transcript
                # NOTE(review): refseqlink_df is only registered when a
                # refSeqLink file was found — if use_protein/add_protein/
                # full_format is enabled without one, this query fails;
                # confirm upstream guarantees that combination never occurs.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name with the configured options
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level name for the same
                # transcript (only when the first name was not already
                # protein-level or full-format)
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Join all HGVS annotations into a single comma-separated string
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full
 7863
 7864        # Polars connexion
 7865        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7866
 7867        # Config
 7868        config = self.get_config()
 7869
 7870        # Databases
 7871        # Genome
 7872        databases_genomes_folders = (
 7873            config.get("folders", {})
 7874            .get("databases", {})
 7875            .get("genomes", DEFAULT_GENOME_FOLDER)
 7876        )
 7877        databases_genome = (
 7878            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7879        )
 7880        # refseq database folder
 7881        databases_refseq_folders = (
 7882            config.get("folders", {})
 7883            .get("databases", {})
 7884            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7885        )
 7886        # refseq
 7887        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7888        # refSeqLink
 7889        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7890
 7891        # Param
 7892        param = self.get_param()
 7893
 7894        # Quick HGVS
 7895        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7896            log.info(f"Quick HGVS Annotation:")
 7897            if not param.get("hgvs", None):
 7898                param["hgvs"] = {}
 7899            for option in param.get("hgvs_options", "").split(","):
 7900                option_var_val = option.split("=")
 7901                option_var = option_var_val[0]
 7902                if len(option_var_val) > 1:
 7903                    option_val = option_var_val[1]
 7904                else:
 7905                    option_val = "True"
 7906                if option_val.upper() in ["TRUE"]:
 7907                    option_val = True
 7908                elif option_val.upper() in ["FALSE"]:
 7909                    option_val = False
 7910                log.info(f"   {option_var}={option_val}")
 7911                param["hgvs"][option_var] = option_val
 7912
 7913        # Check if HGVS annotation enabled
 7914        if "hgvs" in param:
 7915            log.info(f"HGVS Annotation... ")
 7916            for hgvs_option in param.get("hgvs", {}):
 7917                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7918        else:
 7919            return
 7920
 7921        # HGVS Param
 7922        param_hgvs = param.get("hgvs", {})
 7923        use_exon = param_hgvs.get("use_exon", False)
 7924        use_gene = param_hgvs.get("use_gene", False)
 7925        use_protein = param_hgvs.get("use_protein", False)
 7926        add_protein = param_hgvs.get("add_protein", False)
 7927        full_format = param_hgvs.get("full_format", False)
 7928        use_version = param_hgvs.get("use_version", False)
 7929        codon_type = param_hgvs.get("codon_type", "3")
 7930
        # refSeq and refSeqLink
 7932        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 7933        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 7934
 7935        # Assembly
 7936        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 7937
 7938        # Genome
 7939        genome_file = None
 7940        if find_genome(databases_genome):
 7941            genome_file = find_genome(databases_genome)
 7942        else:
 7943            genome_file = find_genome(
 7944                genome_path=databases_genomes_folders, assembly=assembly
 7945            )
 7946        log.debug("Genome: " + str(genome_file))
 7947
        # refSeq
 7949        refseq_file = find_file_prefix(
 7950            input_file=databases_refseq,
 7951            prefix="ncbiRefSeq",
 7952            folder=databases_refseq_folders,
 7953            assembly=assembly,
 7954        )
 7955        log.debug("refSeq: " + str(refseq_file))
 7956
 7957        # refSeqLink
 7958        refseqlink_file = find_file_prefix(
 7959            input_file=databases_refseqlink,
 7960            prefix="ncbiRefSeqLink",
 7961            folder=databases_refseq_folders,
 7962            assembly=assembly,
 7963        )
 7964        log.debug("refSeqLink: " + str(refseqlink_file))
 7965
 7966        # Threads
 7967        if not threads:
 7968            threads = self.get_threads()
 7969        log.debug("Threads: " + str(threads))
 7970
 7971        # Variables
 7972        table_variants = self.get_table_variants(clause="update")
 7973
 7974        # Get variants SNV and InDel only
 7975        query_variants = f"""
 7976            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 7977            FROM {table_variants}
 7978            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 7979            """
 7980        df_variants = self.get_query_to_df(query_variants)
 7981
 7982        # Added columns
 7983        added_columns = []
 7984
 7985        # Add hgvs column in variants table
 7986        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 7987        added_column = self.add_column(
 7988            table_variants, hgvs_column_name, "STRING", default_value=None
 7989        )
 7990        added_columns.append(added_column)
 7991
 7992        log.debug(f"refSeq loading...")
 7993        # refSeq in duckDB
 7994        refseq_table = get_refseq_table(
 7995            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 7996        )
 7997        # Loading all refSeq in Dataframe
 7998        refseq_query = f"""
 7999            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 8000            FROM {refseq_table}
 8001            JOIN df_variants ON (
 8002                {refseq_table}.chrom = df_variants.CHROM
 8003                AND {refseq_table}.txStart<=df_variants.POS
 8004                AND {refseq_table}.txEnd>=df_variants.POS
 8005            )
 8006        """
 8007        refseq_df = self.conn.query(refseq_query).pl()
 8008
 8009        if refseqlink_file:
 8010            log.debug(f"refSeqLink loading...")
 8011            # refSeqLink in duckDB
 8012            refseqlink_table = get_refseq_table(
 8013                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 8014            )
 8015            # Loading all refSeqLink in Dataframe
 8016            protacc_column = "protAcc_with_ver"
 8017            mrnaacc_column = "mrnaAcc_with_ver"
 8018            refseqlink_query = f"""
 8019                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 8020                FROM {refseqlink_table} 
 8021                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 8022                WHERE protAcc_without_ver IS NOT NULL
 8023            """
 8024            # Polars Dataframe
 8025            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 8026
 8027        # Read RefSeq transcripts into a python dict/model.
 8028        log.debug(f"Transcripts loading...")
 8029        with tempfile.TemporaryDirectory() as tmpdir:
 8030            transcripts_query = f"""
 8031                COPY (
 8032                    SELECT {refseq_table}.*
 8033                    FROM {refseq_table}
 8034                    JOIN df_variants ON (
 8035                        {refseq_table}.chrom=df_variants.CHROM
 8036                        AND {refseq_table}.txStart<=df_variants.POS
 8037                        AND {refseq_table}.txEnd>=df_variants.POS
 8038                    )
 8039                )
 8040                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 8041            """
 8042            self.conn.query(transcripts_query)
 8043            with open(f"{tmpdir}/transcript.tsv") as infile:
 8044                transcripts = read_transcripts(infile)
 8045
 8046        # Polars connexion
 8047        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8048
 8049        log.debug("Genome loading...")
 8050        # Read genome sequence using pyfaidx.
 8051        genome = Fasta(genome_file)
 8052
 8053        log.debug("Start annotation HGVS...")
 8054
 8055        # Create
 8056        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 8057        ddf = dd.from_pandas(df_variants, npartitions=threads)
 8058
 8059        # Use dask.dataframe.apply() to apply function on each partition
 8060        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 8061
 8062        # Convert Dask DataFrame to Pandas Dataframe
 8063        df = ddf.compute()
 8064
 8065        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 8066        with tempfile.TemporaryDirectory() as tmpdir:
 8067            df_parquet = os.path.join(tmpdir, "df.parquet")
 8068            df.to_parquet(df_parquet)
 8069
 8070            # Update hgvs column
 8071            update_variant_query = f"""
 8072                UPDATE {table_variants}
 8073                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 8074                FROM read_parquet('{df_parquet}') as df
 8075                WHERE variants."#CHROM" = df.CHROM
 8076                AND variants.POS = df.POS
 8077                AND variants.REF = df.REF
 8078                AND variants.ALT = df.ALT
 8079                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 8080                """
 8081            self.execute_query(update_variant_query)
 8082
 8083        # Update INFO column
 8084        sql_query_update = f"""
 8085            UPDATE {table_variants}
 8086            SET INFO = 
 8087                concat(
 8088                    CASE 
 8089                        WHEN INFO NOT IN ('','.')
 8090                        THEN concat(INFO, ';')
 8091                        ELSE ''
 8092                    END,
 8093                    'hgvs=',
 8094                    {hgvs_column_name}
 8095                )
 8096            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 8097            """
 8098        self.execute_query(sql_query_update)
 8099
 8100        # Add header
 8101        HGVS_INFOS = {
 8102            "hgvs": {
 8103                "ID": "hgvs",
 8104                "Number": ".",
 8105                "Type": "String",
 8106                "Description": f"HGVS annotatation with HOWARD",
 8107            }
 8108        }
 8109
 8110        for field in HGVS_INFOS:
 8111            field_ID = HGVS_INFOS[field]["ID"]
 8112            field_description = HGVS_INFOS[field]["Description"]
 8113            self.get_header().infos[field_ID] = vcf.parser._Info(
 8114                field_ID,
 8115                HGVS_INFOS[field]["Number"],
 8116                HGVS_INFOS[field]["Type"],
 8117                field_description,
 8118                "unknown",
 8119                "unknown",
 8120                code_type_map[HGVS_INFOS[field]["Type"]],
 8121            )
 8122
 8123        # Remove added columns
 8124        for added_column in added_columns:
 8125            self.drop_column(column=added_column)
 8126
 8127    ###
 8128    # Calculation
 8129    ###
 8130
 8131    def get_operations_help(
 8132        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8133    ) -> list:
 8134
 8135        # Init
 8136        operations_help = []
 8137
 8138        # operations
 8139        operations = self.get_config_json(
 8140            name="calculations",
 8141            config_dict=operations_config_dict,
 8142            config_file=operations_config_file,
 8143        )
 8144        for op in operations:
 8145            op_name = operations[op].get("name", op).upper()
 8146            op_description = operations[op].get("description", op_name)
 8147            op_available = operations[op].get("available", False)
 8148            if op_available:
 8149                operations_help.append(f"   {op_name}: {op_description}")
 8150
 8151        # Sort operations
 8152        operations_help.sort()
 8153
 8154        # insert header
 8155        operations_help.insert(0, "Available calculation operations:")
 8156
 8157        # Return
 8158        return operations_help
 8159
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run a set of calculation operations on the variants table.

        Each requested operation is looked up in the operations configuration and
        dispatched either to `calculation_process_function` (type "python") or to
        `calculation_process_sql` (type "sql").

        Operations are gathered from, in order:
        1. param["calculation"]["calculations"] (falling back to the `operations`
           argument),
        2. the quick-calculation comma-separated string in param["calculations"],
           whose entries are merged in front while preserving their order.

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle": null
                }
            }

        :param operations: dict of operations to apply (used if none in param)
        :param operations_config_dict: optional operations configuration dictionary
        :param operations_config_file: optional path to an operations configuration file
        :raises ValueError: if an operation name or type is not available in the config
        """

        # Param
        param = self.get_param()

        # Operations configuration (merged from dict and/or file)
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Normalize config keys to upper case for case-insensitive lookup
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (takes precedence over the `operations` argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add operations given as a comma-separated string
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    # Reuse options already defined in param for this operation (if any)
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param (after the quick ones, order preserved)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (fallback to param if still empty)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # Dispatch each operation to its python or sql processor
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 8281
 8282    def calculation_process_sql(
 8283        self, operation: dict, operation_name: str = "unknown"
 8284    ) -> None:
 8285        """
 8286        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8287        performs the operation, updating the specified table with the result.
 8288
 8289        :param operation: The `operation` parameter is a dictionary that contains information about the
 8290        mathematical operation to be performed. It includes the following keys:
 8291        :type operation: dict
 8292        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8293        the mathematical operation being performed. It is used for logging and error handling purposes,
 8294        defaults to unknown
 8295        :type operation_name: str (optional)
 8296        """
 8297
 8298        # table variants
 8299        table_variants = self.get_table_variants(clause="alter")
 8300
 8301        # Operation infos
 8302        operation_name = operation.get("name", "unknown")
 8303        log.debug(f"process sql {operation_name}")
 8304        output_column_name = operation.get("output_column_name", operation_name)
 8305        output_column_type = operation.get("output_column_type", "String")
 8306        prefix = operation.get("explode_infos_prefix", "")
 8307        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8308        output_column_description = operation.get(
 8309            "output_column_description", f"{operation_name} operation"
 8310        )
 8311        operation_query = operation.get("operation_query", None)
 8312        if isinstance(operation_query, list):
 8313            operation_query = " ".join(operation_query)
 8314        operation_info_fields = operation.get("info_fields", [])
 8315        operation_info_fields_check = operation.get("info_fields_check", False)
 8316        operation_info = operation.get("operation_info", True)
 8317
 8318        if operation_query:
 8319
 8320            # Info fields check
 8321            operation_info_fields_check_result = True
 8322            if operation_info_fields_check:
 8323                header_infos = self.get_header().infos
 8324                for info_field in operation_info_fields:
 8325                    operation_info_fields_check_result = (
 8326                        operation_info_fields_check_result
 8327                        and info_field in header_infos
 8328                    )
 8329
 8330            # If info fields available
 8331            if operation_info_fields_check_result:
 8332
 8333                # Added_columns
 8334                added_columns = []
 8335
 8336                # Create VCF header field
 8337                vcf_reader = self.get_header()
 8338                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8339                    output_column_name,
 8340                    ".",
 8341                    output_column_type,
 8342                    output_column_description,
 8343                    "howard calculation",
 8344                    "0",
 8345                    self.code_type_map.get(output_column_type),
 8346                )
 8347
 8348                # Explode infos if needed
 8349                log.debug(f"calculation_process_sql prefix {prefix}")
 8350                added_columns += self.explode_infos(
 8351                    prefix=prefix,
 8352                    fields=[output_column_name] + operation_info_fields,
 8353                    force=True,
 8354                )
 8355
 8356                # Create column
 8357                added_column = self.add_column(
 8358                    table_name=table_variants,
 8359                    column_name=prefix + output_column_name,
 8360                    column_type=output_column_type_sql,
 8361                    default_value="null",
 8362                )
 8363                added_columns.append(added_column)
 8364
 8365                # Operation calculation
 8366                try:
 8367
 8368                    # Query to update calculation column
 8369                    sql_update = f"""
 8370                        UPDATE {table_variants}
 8371                        SET "{prefix}{output_column_name}" = ({operation_query})
 8372                    """
 8373                    self.conn.execute(sql_update)
 8374
 8375                    # Add to INFO
 8376                    if operation_info:
 8377                        sql_update_info = f"""
 8378                            UPDATE {table_variants}
 8379                            SET "INFO" =
 8380                                concat(
 8381                                    CASE
 8382                                        WHEN "INFO" IS NOT NULL
 8383                                        THEN concat("INFO", ';')
 8384                                        ELSE ''
 8385                                    END,
 8386                                    '{output_column_name}=',
 8387                                    "{prefix}{output_column_name}"
 8388                                )
 8389                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8390                        """
 8391                        self.conn.execute(sql_update_info)
 8392
 8393                except:
 8394                    log.error(
 8395                        f"Operations config: Calculation '{operation_name}' query failed"
 8396                    )
 8397                    raise ValueError(
 8398                        f"Operations config: Calculation '{operation_name}' query failed"
 8399                    )
 8400
 8401                # Remove added columns
 8402                for added_column in added_columns:
 8403                    log.debug(f"added_column: {added_column}")
 8404                    self.drop_column(column=added_column)
 8405
 8406            else:
 8407                log.error(
 8408                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8409                )
 8410                raise ValueError(
 8411                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8412                )
 8413
 8414        else:
 8415            log.error(
 8416                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8417            )
 8418            raise ValueError(
 8419                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8420            )
 8421
 8422    def calculation_process_function(
 8423        self, operation: dict, operation_name: str = "unknown"
 8424    ) -> None:
 8425        """
 8426        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8427        function with the given parameters.
 8428
 8429        :param operation: The `operation` parameter is a dictionary that contains information about the
 8430        operation to be performed. It has the following keys:
 8431        :type operation: dict
 8432        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8433        the operation being performed. It is used for logging purposes, defaults to unknown
 8434        :type operation_name: str (optional)
 8435        """
 8436
 8437        operation_name = operation["name"]
 8438        log.debug(f"process sql {operation_name}")
 8439        function_name = operation["function_name"]
 8440        function_params = operation["function_params"]
 8441        getattr(self, function_name)(*function_params)
 8442
 8443    def calculation_variant_id(self) -> None:
 8444        """
 8445        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8446        updates the INFO field of a variants table with the variant ID.
 8447        """
 8448
 8449        # variant_id annotation field
 8450        variant_id_tag = self.get_variant_id_column()
 8451        added_columns = [variant_id_tag]
 8452
 8453        # variant_id hgvs tags"
 8454        vcf_infos_tags = {
 8455            variant_id_tag: "howard variant ID annotation",
 8456        }
 8457
 8458        # Variants table
 8459        table_variants = self.get_table_variants()
 8460
 8461        # Header
 8462        vcf_reader = self.get_header()
 8463
 8464        # Add variant_id to header
 8465        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8466            variant_id_tag,
 8467            ".",
 8468            "String",
 8469            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8470            "howard calculation",
 8471            "0",
 8472            self.code_type_map.get("String"),
 8473        )
 8474
 8475        # Update
 8476        sql_update = f"""
 8477            UPDATE {table_variants}
 8478            SET "INFO" = 
 8479                concat(
 8480                    CASE
 8481                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8482                        THEN ''
 8483                        ELSE concat("INFO", ';')
 8484                    END,
 8485                    '{variant_id_tag}=',
 8486                    "{variant_id_tag}"
 8487                )
 8488        """
 8489        self.conn.execute(sql_update)
 8490
 8491        # Remove added columns
 8492        for added_column in added_columns:
 8493            self.drop_column(column=added_column)
 8494
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the snpEff annotation field and append
        them to the INFO column of the variants table.

        The snpEff annotation header is parsed from the description of the
        `snpeff_field` INFO header (the part quoted between single quotes,
        split on " | "), then `extract_snpeff_hgvs` is applied to each
        variant's annotation. If `snpeff_field` is absent from the header, a
        warning is logged and nothing is changed. Helper columns added for the
        computation are dropped at the end.

        :param snpeff_hgvs: name of the INFO field that will store the HGVS
        nomenclatures extracted from the snpEff annotations, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: INFO field containing the snpEff annotations,
        defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description cannot be parsed
        """

        # Snpeff hgvs tags (header descriptions)
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty explode prefix is replaced by "INFO/" — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff column names once INFO fields are exploded
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract annotation names from the quoted part of the ANN header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters to build a safe identifier
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (column added to the variants table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant id and ANN annotation into a dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Extract the HGVS nomenclatures for each variant
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append the extracted HGVS to INFO; the dataframe is presumably
            # resolved by duckdb's replacement scan on the local variable name
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8631
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode the snpEff annotation field into separate INFO annotations.

        The snpEff annotation header is parsed from the `snpeff_field` INFO
        header description, each variant's annotations are exploded with
        `explode_snpeff_ann`, and the result is appended to the INFO column,
        either as one JSON field or as one field per snpEff annotation.
        If `snpeff_field` is absent from the header, a warning is logged and
        nothing is changed. Helper columns are dropped at the end.

        :param uniquify: whether duplicate values should be removed from the
        exploded output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: output format of the exploded annotations, either
        "fields" (one INFO field per annotation) or "JSON" (a single JSON
        field), defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated INFO fields (or,
        for JSON output, the name of the single generated field), defaults to
        snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: INFO field containing the snpEff annotations,
        defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description cannot be parsed
        """

        # SnpEff annotation field
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags (header descriptions)
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any non-empty explode prefix is replaced by "INFO/" — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff column names once INFO fields are exploded
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract annotation names from the quoted part of the ANN header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters to build a safe identifier
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (column added to the variants table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant id and ANN annotation into a dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode the snpEff annotations for each variant
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: one JSON field, or one field per snpEff annotation
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO; the dataframe is presumably
            # resolved by duckdb's replacement scan on the local variable name
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8806
 8807    def calculation_extract_nomen(self) -> None:
 8808        """
 8809        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
 8810        """
 8811
 8812        # NOMEN field
 8813        field_nomen_dict = "NOMEN_DICT"
 8814
 8815        # NOMEN structure
 8816        nomen_dict = {
 8817            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
 8818            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
 8819            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
 8820            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
 8821            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
 8822            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
 8823            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
 8824            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
 8825            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
 8826            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
 8827        }
 8828
 8829        # Param
 8830        param = self.get_param()
 8831
 8832        # Prefix
 8833        prefix = self.get_explode_infos_prefix()
 8834
 8835        # Header
 8836        vcf_reader = self.get_header()
 8837
 8838        # Added columns
 8839        added_columns = []
 8840
 8841        # Get HGVS field
 8842        hgvs_field = (
 8843            param.get("calculation", {})
 8844            .get("calculations", {})
 8845            .get("NOMEN", {})
 8846            .get("options", {})
 8847            .get("hgvs_field", "hgvs")
 8848        )
 8849
 8850        # Get NOMEN pattern
 8851        nomen_pattern = (
 8852            param.get("calculation", {})
 8853            .get("calculations", {})
 8854            .get("NOMEN", {})
 8855            .get("options", {})
 8856            .get("pattern", None)
 8857        )
 8858
 8859        # transcripts list of preference sources
 8860        transcripts_sources = {}
 8861
 8862        # Get transcripts
 8863        transcripts_file = (
 8864            param.get("calculation", {})
 8865            .get("calculations", {})
 8866            .get("NOMEN", {})
 8867            .get("options", {})
 8868            .get("transcripts", None)
 8869        )
 8870        transcripts_file = full_path(transcripts_file)
 8871        if transcripts_file:
 8872            if os.path.exists(transcripts_file):
 8873                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
 8874                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
 8875                transcripts_sources["file"] = transcripts_from_file
 8876            else:
 8877                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
 8878                log.error(msg_err)
 8879                raise ValueError(msg_err)
 8880
 8881        # Get transcripts table
 8882        transcripts_table = (
 8883            param.get("calculation", {})
 8884            .get("calculations", {})
 8885            .get("NOMEN", {})
 8886            .get("options", {})
 8887            .get("transcripts_table", self.get_table_variants())
 8888        )
 8889        # Get transcripts column
 8890        transcripts_column = (
 8891            param.get("calculation", {})
 8892            .get("calculations", {})
 8893            .get("NOMEN", {})
 8894            .get("options", {})
 8895            .get("transcripts_column", None)
 8896        )
 8897
 8898        if transcripts_table and transcripts_column:
 8899            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
 8900            # Explode if not exists
 8901            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
 8902        else:
 8903            extra_field_transcript = f"NULL"
 8904
 8905        # Transcripts of preference source order
 8906        transcripts_order = (
 8907            param.get("calculation", {})
 8908            .get("calculations", {})
 8909            .get("NOMEN", {})
 8910            .get("options", {})
 8911            .get("transcripts_order", ["column", "file"])
 8912        )
 8913
 8914        # Transcripts from file
 8915        transcripts = transcripts_sources.get("file", [])
 8916
 8917        # Explode HGVS field in column
 8918        added_columns += self.explode_infos(fields=[hgvs_field])
 8919
 8920        # extra infos
 8921        extra_infos = self.get_extra_infos()
 8922        extra_field = prefix + hgvs_field
 8923
 8924        if extra_field in extra_infos:
 8925
 8926            # Create dataframe
 8927            dataframe_hgvs = self.get_query_to_df(
 8928                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
 8929            )
 8930
 8931            # Create main NOMEN column
 8932            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
 8933                lambda x: find_nomen(
 8934                    hgvs=x.hgvs,
 8935                    transcript=x.transcript,
 8936                    transcripts=transcripts,
 8937                    pattern=nomen_pattern,
 8938                    transcripts_source_order=transcripts_order,
 8939                ),
 8940                axis=1,
 8941            )
 8942
 8943            # Explode NOMEN Structure and create SQL set for update
 8944            sql_nomen_fields = []
 8945            for nomen_field in nomen_dict:
 8946
 8947                # Explode each field into a column
 8948                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
 8949                    lambda x: dict(x).get(nomen_field, "")
 8950                )
 8951
 8952                # Create VCF header field
 8953                vcf_reader.infos[nomen_field] = vcf.parser._Info(
 8954                    nomen_field,
 8955                    ".",
 8956                    "String",
 8957                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
 8958                    "howard calculation",
 8959                    "0",
 8960                    self.code_type_map.get("String"),
 8961                )
 8962                sql_nomen_fields.append(
 8963                    f"""
 8964                        CASE 
 8965                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
 8966                            THEN concat(
 8967                                    ';{nomen_field}=',
 8968                                    dataframe_hgvs."{nomen_field}"
 8969                                )
 8970                            ELSE ''
 8971                        END
 8972                    """
 8973                )
 8974
 8975            # SQL set for update
 8976            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
 8977
 8978            # Update
 8979            sql_update = f"""
 8980                UPDATE variants
 8981                SET "INFO" = 
 8982                    concat(
 8983                        CASE
 8984                            WHEN "INFO" IS NULL
 8985                            THEN ''
 8986                            ELSE "INFO"
 8987                        END,
 8988                        {sql_nomen_fields_set}
 8989                    )
 8990                FROM dataframe_hgvs
 8991                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
 8992                    AND variants."POS" = dataframe_hgvs."POS" 
 8993                    AND variants."REF" = dataframe_hgvs."REF"
 8994                    AND variants."ALT" = dataframe_hgvs."ALT"
 8995            """
 8996            self.conn.execute(sql_update)
 8997
 8998            # Delete dataframe
 8999            del dataframe_hgvs
 9000            gc.collect()
 9001
 9002        # Remove added columns
 9003        for added_column in added_columns:
 9004            self.drop_column(column=added_column)
 9005
 9006    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9007        """
 9008        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9009        pipeline/sample for a variant and updates the variant information in a VCF file.
 9010
 9011        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9012        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9013        VCF header and to update the corresponding field in the variants table, defaults to
 9014        findbypipeline
 9015        :type tag: str (optional)
 9016        """
 9017
 9018        # if FORMAT and samples
 9019        if (
 9020            "FORMAT" in self.get_header_columns_as_list()
 9021            and self.get_header_sample_list()
 9022        ):
 9023
 9024            # findbypipeline annotation field
 9025            findbypipeline_tag = tag
 9026
 9027            # VCF infos tags
 9028            vcf_infos_tags = {
 9029                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9030            }
 9031
 9032            # Prefix
 9033            prefix = self.get_explode_infos_prefix()
 9034
 9035            # Field
 9036            findbypipeline_infos = prefix + findbypipeline_tag
 9037
 9038            # Variants table
 9039            table_variants = self.get_table_variants()
 9040
 9041            # Header
 9042            vcf_reader = self.get_header()
 9043
 9044            # Create variant id
 9045            variant_id_column = self.get_variant_id_column()
 9046            added_columns = [variant_id_column]
 9047
 9048            # variant_id, FORMAT and samples
 9049            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9050                self.get_header_sample_list()
 9051            )
 9052
 9053            # Create dataframe
 9054            dataframe_findbypipeline = self.get_query_to_df(
 9055                f""" SELECT {samples_fields} FROM {table_variants} """
 9056            )
 9057
 9058            # Create findbypipeline column
 9059            dataframe_findbypipeline[findbypipeline_infos] = (
 9060                dataframe_findbypipeline.apply(
 9061                    lambda row: findbypipeline(
 9062                        row, samples=self.get_header_sample_list()
 9063                    ),
 9064                    axis=1,
 9065                )
 9066            )
 9067
 9068            # Add snpeff_hgvs to header
 9069            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9070                findbypipeline_tag,
 9071                ".",
 9072                "String",
 9073                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9074                "howard calculation",
 9075                "0",
 9076                self.code_type_map.get("String"),
 9077            )
 9078
 9079            # Update
 9080            sql_update = f"""
 9081                UPDATE variants
 9082                SET "INFO" = 
 9083                    concat(
 9084                        CASE
 9085                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9086                            THEN ''
 9087                            ELSE concat("INFO", ';')
 9088                        END,
 9089                        CASE 
 9090                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9091                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9092                            THEN concat(
 9093                                    '{findbypipeline_tag}=',
 9094                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9095                                )
 9096                            ELSE ''
 9097                        END
 9098                    )
 9099                FROM dataframe_findbypipeline
 9100                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9101            """
 9102            self.conn.execute(sql_update)
 9103
 9104            # Remove added columns
 9105            for added_column in added_columns:
 9106                self.drop_column(column=added_column)
 9107
 9108            # Delete dataframe
 9109            del dataframe_findbypipeline
 9110            gc.collect()
 9111
 9112    def calculation_genotype_concordance(self) -> None:
 9113        """
 9114        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9115        multi-caller VCF files and updates the variant information in the database.
 9116        """
 9117
 9118        # if FORMAT and samples
 9119        if (
 9120            "FORMAT" in self.get_header_columns_as_list()
 9121            and self.get_header_sample_list()
 9122        ):
 9123
 9124            # genotypeconcordance annotation field
 9125            genotypeconcordance_tag = "genotypeconcordance"
 9126
 9127            # VCF infos tags
 9128            vcf_infos_tags = {
 9129                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9130            }
 9131
 9132            # Prefix
 9133            prefix = self.get_explode_infos_prefix()
 9134
 9135            # Field
 9136            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9137
 9138            # Variants table
 9139            table_variants = self.get_table_variants()
 9140
 9141            # Header
 9142            vcf_reader = self.get_header()
 9143
 9144            # Create variant id
 9145            variant_id_column = self.get_variant_id_column()
 9146            added_columns = [variant_id_column]
 9147
 9148            # variant_id, FORMAT and samples
 9149            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9150                self.get_header_sample_list()
 9151            )
 9152
 9153            # Create dataframe
 9154            dataframe_genotypeconcordance = self.get_query_to_df(
 9155                f""" SELECT {samples_fields} FROM {table_variants} """
 9156            )
 9157
 9158            # Create genotypeconcordance column
 9159            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9160                dataframe_genotypeconcordance.apply(
 9161                    lambda row: genotypeconcordance(
 9162                        row, samples=self.get_header_sample_list()
 9163                    ),
 9164                    axis=1,
 9165                )
 9166            )
 9167
 9168            # Add genotypeconcordance to header
 9169            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9170                genotypeconcordance_tag,
 9171                ".",
 9172                "String",
 9173                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9174                "howard calculation",
 9175                "0",
 9176                self.code_type_map.get("String"),
 9177            )
 9178
 9179            # Update
 9180            sql_update = f"""
 9181                UPDATE variants
 9182                SET "INFO" = 
 9183                    concat(
 9184                        CASE
 9185                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9186                            THEN ''
 9187                            ELSE concat("INFO", ';')
 9188                        END,
 9189                        CASE
 9190                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9191                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9192                            THEN concat(
 9193                                    '{genotypeconcordance_tag}=',
 9194                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9195                                )
 9196                            ELSE ''
 9197                        END
 9198                    )
 9199                FROM dataframe_genotypeconcordance
 9200                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9201            """
 9202            self.conn.execute(sql_update)
 9203
 9204            # Remove added columns
 9205            for added_column in added_columns:
 9206                self.drop_column(column=added_column)
 9207
 9208            # Delete dataframe
 9209            del dataframe_genotypeconcordance
 9210            gc.collect()
 9211
 9212    def calculation_barcode(self, tag: str = "barcode") -> None:
 9213        """
 9214        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9215        updates the INFO field in the file with the calculated barcode values.
 9216
 9217        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9218        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9219        the default tag name is set to "barcode", defaults to barcode
 9220        :type tag: str (optional)
 9221        """
 9222
 9223        # if FORMAT and samples
 9224        if (
 9225            "FORMAT" in self.get_header_columns_as_list()
 9226            and self.get_header_sample_list()
 9227        ):
 9228
 9229            # barcode annotation field
 9230            if not tag:
 9231                tag = "barcode"
 9232
 9233            # VCF infos tags
 9234            vcf_infos_tags = {
 9235                tag: "barcode calculation (VaRank)",
 9236            }
 9237
 9238            # Prefix
 9239            prefix = self.get_explode_infos_prefix()
 9240
 9241            # Field
 9242            barcode_infos = prefix + tag
 9243
 9244            # Variants table
 9245            table_variants = self.get_table_variants()
 9246
 9247            # Header
 9248            vcf_reader = self.get_header()
 9249
 9250            # Create variant id
 9251            variant_id_column = self.get_variant_id_column()
 9252            added_columns = [variant_id_column]
 9253
 9254            # variant_id, FORMAT and samples
 9255            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9256                self.get_header_sample_list()
 9257            )
 9258
 9259            # Create dataframe
 9260            dataframe_barcode = self.get_query_to_df(
 9261                f""" SELECT {samples_fields} FROM {table_variants} """
 9262            )
 9263
 9264            # Create barcode column
 9265            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9266                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9267            )
 9268
 9269            # Add barcode to header
 9270            vcf_reader.infos[tag] = vcf.parser._Info(
 9271                tag,
 9272                ".",
 9273                "String",
 9274                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9275                "howard calculation",
 9276                "0",
 9277                self.code_type_map.get("String"),
 9278            )
 9279
 9280            # Update
 9281            sql_update = f"""
 9282                UPDATE {table_variants}
 9283                SET "INFO" = 
 9284                    concat(
 9285                        CASE
 9286                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9287                            THEN ''
 9288                            ELSE concat("INFO", ';')
 9289                        END,
 9290                        CASE
 9291                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9292                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9293                            THEN concat(
 9294                                    '{tag}=',
 9295                                    dataframe_barcode."{barcode_infos}"
 9296                                )
 9297                            ELSE ''
 9298                        END
 9299                    )
 9300                FROM dataframe_barcode
 9301                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9302            """
 9303            self.conn.execute(sql_update)
 9304
 9305            # Remove added columns
 9306            for added_column in added_columns:
 9307                self.drop_column(column=added_column)
 9308
 9309            # Delete dataframe
 9310            del dataframe_barcode
 9311            gc.collect()
 9312
 9313    def calculation_barcode_family(self, tag: str = "BCF") -> None:
 9314        """
 9315        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
 9316        and updates the INFO field in the file with the calculated barcode values.
 9317
 9318        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
 9319        the barcode tag that will be added to the VCF file during the calculation process. If no value
 9320        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
 9321        :type tag: str (optional)
 9322        """
 9323
 9324        # if FORMAT and samples
 9325        if (
 9326            "FORMAT" in self.get_header_columns_as_list()
 9327            and self.get_header_sample_list()
 9328        ):
 9329
 9330            # barcode annotation field
 9331            if not tag:
 9332                tag = "BCF"
 9333
 9334            # VCF infos tags
 9335            vcf_infos_tags = {
 9336                tag: "barcode family calculation",
 9337                f"{tag}S": "barcode family samples",
 9338            }
 9339
 9340            # Param
 9341            param = self.get_param()
 9342            log.debug(f"param={param}")
 9343
 9344            # Prefix
 9345            prefix = self.get_explode_infos_prefix()
 9346
 9347            # PED param
 9348            ped = (
 9349                param.get("calculation", {})
 9350                .get("calculations", {})
 9351                .get("BARCODEFAMILY", {})
 9352                .get("family_pedigree", None)
 9353            )
 9354            log.debug(f"ped={ped}")
 9355
 9356            # Load PED
 9357            if ped:
 9358
 9359                # Pedigree is a file
 9360                if isinstance(ped, str) and os.path.exists(full_path(ped)):
 9361                    log.debug("Pedigree is file")
 9362                    with open(full_path(ped)) as ped:
 9363                        ped = json.load(ped)
 9364
 9365                # Pedigree is a string
 9366                elif isinstance(ped, str):
 9367                    log.debug("Pedigree is str")
 9368                    try:
 9369                        ped = json.loads(ped)
 9370                        log.debug("Pedigree is json str")
 9371                    except ValueError as e:
 9372                        ped_samples = ped.split(",")
 9373                        ped = {}
 9374                        for ped_sample in ped_samples:
 9375                            ped[ped_sample] = ped_sample
 9376
 9377                # Pedigree is a dict
 9378                elif isinstance(ped, dict):
 9379                    log.debug("Pedigree is dict")
 9380
 9381                # Pedigree is not well formatted
 9382                else:
 9383                    msg_error = "Pedigree not well formatted"
 9384                    log.error(msg_error)
 9385                    raise ValueError(msg_error)
 9386
 9387                # Construct list
 9388                ped_samples = list(ped.values())
 9389
 9390            else:
 9391                log.debug("Pedigree not defined. Take all samples")
 9392                ped_samples = self.get_header_sample_list()
 9393                ped = {}
 9394                for ped_sample in ped_samples:
 9395                    ped[ped_sample] = ped_sample
 9396
 9397            # Check pedigree
 9398            if not ped or len(ped) == 0:
 9399                msg_error = f"Error in pedigree: samples {ped_samples}"
 9400                log.error(msg_error)
 9401                raise ValueError(msg_error)
 9402
 9403            # Log
 9404            log.info(
 9405                "Calculation 'BARCODEFAMILY' - Samples: "
 9406                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
 9407            )
 9408            log.debug(f"ped_samples={ped_samples}")
 9409
 9410            # Field
 9411            barcode_infos = prefix + tag
 9412
 9413            # Variants table
 9414            table_variants = self.get_table_variants()
 9415
 9416            # Header
 9417            vcf_reader = self.get_header()
 9418
 9419            # Create variant id
 9420            variant_id_column = self.get_variant_id_column()
 9421            added_columns = [variant_id_column]
 9422
 9423            # variant_id, FORMAT and samples
 9424            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9425                ped_samples
 9426            )
 9427
 9428            # Create dataframe
 9429            dataframe_barcode = self.get_query_to_df(
 9430                f""" SELECT {samples_fields} FROM {table_variants} """
 9431            )
 9432
 9433            # Create barcode column
 9434            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9435                lambda row: barcode(row, samples=ped_samples), axis=1
 9436            )
 9437
 9438            # Add barcode family to header
 9439            # Add vaf_normalization to header
 9440            vcf_reader.formats[tag] = vcf.parser._Format(
 9441                id=tag,
 9442                num=".",
 9443                type="String",
 9444                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
 9445                type_code=self.code_type_map.get("String"),
 9446            )
 9447            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
 9448                id=f"{tag}S",
 9449                num=".",
 9450                type="String",
 9451                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
 9452                type_code=self.code_type_map.get("String"),
 9453            )
 9454
 9455            # Update
 9456            # for sample in ped_samples:
 9457            sql_update_set = []
 9458            for sample in self.get_header_sample_list() + ["FORMAT"]:
 9459                if sample in ped_samples:
 9460                    value = f'dataframe_barcode."{barcode_infos}"'
 9461                    value_samples = "'" + ",".join(ped_samples) + "'"
 9462                elif sample == "FORMAT":
 9463                    value = f"'{tag}'"
 9464                    value_samples = f"'{tag}S'"
 9465                else:
 9466                    value = "'.'"
 9467                    value_samples = "'.'"
 9468                format_regex = r"[a-zA-Z0-9\s]"
 9469                sql_update_set.append(
 9470                    f"""
 9471                        "{sample}" = 
 9472                        concat(
 9473                            CASE
 9474                                WHEN {table_variants}."{sample}" = './.'
 9475                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
 9476                                ELSE {table_variants}."{sample}"
 9477                            END,
 9478                            ':',
 9479                            {value},
 9480                            ':',
 9481                            {value_samples}
 9482                        )
 9483                    """
 9484                )
 9485
 9486            sql_update_set_join = ", ".join(sql_update_set)
 9487            sql_update = f"""
 9488                UPDATE {table_variants}
 9489                SET {sql_update_set_join}
 9490                FROM dataframe_barcode
 9491                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9492            """
 9493            self.conn.execute(sql_update)
 9494
 9495            # Remove added columns
 9496            for added_column in added_columns:
 9497                self.drop_column(column=added_column)
 9498
 9499            # Delete dataframe
 9500            del dataframe_barcode
 9501            gc.collect()
 9502
 9503    def calculation_trio(self) -> None:
 9504        """
 9505        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9506        information to the INFO field of each variant.
 9507        """
 9508
 9509        # if FORMAT and samples
 9510        if (
 9511            "FORMAT" in self.get_header_columns_as_list()
 9512            and self.get_header_sample_list()
 9513        ):
 9514
 9515            # trio annotation field
 9516            trio_tag = "trio"
 9517
 9518            # VCF infos tags
 9519            vcf_infos_tags = {
 9520                "trio": "trio calculation",
 9521            }
 9522
 9523            # Param
 9524            param = self.get_param()
 9525
 9526            # Prefix
 9527            prefix = self.get_explode_infos_prefix()
 9528
 9529            # Trio param
 9530            trio_ped = (
 9531                param.get("calculation", {})
 9532                .get("calculations", {})
 9533                .get("TRIO", {})
 9534                .get("trio_pedigree", None)
 9535            )
 9536
 9537            # Load trio
 9538            if trio_ped:
 9539
 9540                # Trio pedigree is a file
 9541                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9542                    log.debug("TRIO pedigree is file")
 9543                    with open(full_path(trio_ped)) as trio_ped:
 9544                        trio_ped = json.load(trio_ped)
 9545
 9546                # Trio pedigree is a string
 9547                elif isinstance(trio_ped, str):
 9548                    log.debug("TRIO pedigree is str")
 9549                    try:
 9550                        trio_ped = json.loads(trio_ped)
 9551                        log.debug("TRIO pedigree is json str")
 9552                    except ValueError as e:
 9553                        trio_samples = trio_ped.split(",")
 9554                        if len(trio_samples) == 3:
 9555                            trio_ped = {
 9556                                "father": trio_samples[0],
 9557                                "mother": trio_samples[1],
 9558                                "child": trio_samples[2],
 9559                            }
 9560                            log.debug("TRIO pedigree is list str")
 9561                        else:
 9562                            msg_error = "TRIO pedigree not well formatted"
 9563                            log.error(msg_error)
 9564                            raise ValueError(msg_error)
 9565
 9566                # Trio pedigree is a dict
 9567                elif isinstance(trio_ped, dict):
 9568                    log.debug("TRIO pedigree is dict")
 9569
 9570                # Trio pedigree is not well formatted
 9571                else:
 9572                    msg_error = "TRIO pedigree not well formatted"
 9573                    log.error(msg_error)
 9574                    raise ValueError(msg_error)
 9575
 9576                # Construct trio list
 9577                trio_samples = [
 9578                    trio_ped.get("father", ""),
 9579                    trio_ped.get("mother", ""),
 9580                    trio_ped.get("child", ""),
 9581                ]
 9582
 9583            else:
 9584                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9585                samples_list = self.get_header_sample_list()
 9586                if len(samples_list) >= 3:
 9587                    trio_samples = self.get_header_sample_list()[0:3]
 9588                    trio_ped = {
 9589                        "father": trio_samples[0],
 9590                        "mother": trio_samples[1],
 9591                        "child": trio_samples[2],
 9592                    }
 9593                else:
 9594                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9595                    log.error(msg_error)
 9596                    raise ValueError(msg_error)
 9597
 9598            # Check trio pedigree
 9599            if not trio_ped or len(trio_ped) != 3:
 9600                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9601                log.error(msg_error)
 9602                raise ValueError(msg_error)
 9603
 9604            # Log
 9605            log.info(
 9606                f"Calculation 'TRIO' - Samples: "
 9607                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9608            )
 9609
 9610            # Field
 9611            trio_infos = prefix + trio_tag
 9612
 9613            # Variants table
 9614            table_variants = self.get_table_variants()
 9615
 9616            # Header
 9617            vcf_reader = self.get_header()
 9618
 9619            # Create variant id
 9620            variant_id_column = self.get_variant_id_column()
 9621            added_columns = [variant_id_column]
 9622
 9623            # variant_id, FORMAT and samples
 9624            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9625                self.get_header_sample_list()
 9626            )
 9627
 9628            # Create dataframe
 9629            dataframe_trio = self.get_query_to_df(
 9630                f""" SELECT {samples_fields} FROM {table_variants} """
 9631            )
 9632
 9633            # Create trio column
 9634            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9635                lambda row: trio(row, samples=trio_samples), axis=1
 9636            )
 9637
 9638            # Add trio to header
 9639            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9640                trio_tag,
 9641                ".",
 9642                "String",
 9643                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9644                "howard calculation",
 9645                "0",
 9646                self.code_type_map.get("String"),
 9647            )
 9648
 9649            # Update
 9650            sql_update = f"""
 9651                UPDATE {table_variants}
 9652                SET "INFO" = 
 9653                    concat(
 9654                        CASE
 9655                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9656                            THEN ''
 9657                            ELSE concat("INFO", ';')
 9658                        END,
 9659                        CASE
 9660                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9661                             AND dataframe_trio."{trio_infos}" NOT NULL
 9662                            THEN concat(
 9663                                    '{trio_tag}=',
 9664                                    dataframe_trio."{trio_infos}"
 9665                                )
 9666                            ELSE ''
 9667                        END
 9668                    )
 9669                FROM dataframe_trio
 9670                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9671            """
 9672            self.conn.execute(sql_update)
 9673
 9674            # Remove added columns
 9675            for added_column in added_columns:
 9676                self.drop_column(column=added_column)
 9677
 9678            # Delete dataframe
 9679            del dataframe_trio
 9680            gc.collect()
 9681
 9682    def calculation_vaf_normalization(self) -> None:
 9683        """
 9684        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9685        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9686        :return: The function does not return anything.
 9687        """
 9688
 9689        # if FORMAT and samples
 9690        if (
 9691            "FORMAT" in self.get_header_columns_as_list()
 9692            and self.get_header_sample_list()
 9693        ):
 9694
 9695            # vaf_normalization annotation field
 9696            vaf_normalization_tag = "VAF"
 9697
 9698            # VCF infos tags
 9699            vcf_infos_tags = {
 9700                "VAF": "VAF Variant Frequency",
 9701            }
 9702
 9703            # Prefix
 9704            prefix = self.get_explode_infos_prefix()
 9705
 9706            # Variants table
 9707            table_variants = self.get_table_variants()
 9708
 9709            # Header
 9710            vcf_reader = self.get_header()
 9711
 9712            # Do not calculate if VAF already exists
 9713            if "VAF" in vcf_reader.formats:
 9714                log.debug("VAF already on genotypes")
 9715                return
 9716
 9717            # Create variant id
 9718            variant_id_column = self.get_variant_id_column()
 9719            added_columns = [variant_id_column]
 9720
 9721            # variant_id, FORMAT and samples
 9722            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9723                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9724            )
 9725
 9726            # Create dataframe
 9727            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9728            log.debug(f"query={query}")
 9729            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9730
 9731            vaf_normalization_set = []
 9732
 9733            # for each sample vaf_normalization
 9734            for sample in self.get_header_sample_list():
 9735                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9736                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9737                )
 9738                vaf_normalization_set.append(
 9739                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9740                )
 9741
 9742            # Add VAF to FORMAT
 9743            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9744                "FORMAT"
 9745            ].apply(lambda x: str(x) + ":VAF")
 9746            vaf_normalization_set.append(
 9747                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9748            )
 9749
 9750            # Add vaf_normalization to header
 9751            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9752                id=vaf_normalization_tag,
 9753                num="1",
 9754                type="Float",
 9755                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9756                type_code=self.code_type_map.get("Float"),
 9757            )
 9758
 9759            # Create fields to add in INFO
 9760            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9761
 9762            # Update
 9763            sql_update = f"""
 9764                UPDATE {table_variants}
 9765                SET {sql_vaf_normalization_set}
 9766                FROM dataframe_vaf_normalization
 9767                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9768
 9769            """
 9770            self.conn.execute(sql_update)
 9771
 9772            # Remove added columns
 9773            for added_column in added_columns:
 9774                self.drop_column(column=added_column)
 9775
 9776            # Delete dataframe
 9777            del dataframe_vaf_normalization
 9778            gc.collect()
 9779
 9780    def calculation_genotype_stats(self, info: str = "VAF") -> None:
 9781        """
 9782        The `calculation_genotype_stats` function calculates genotype statistics for a given information
 9783        field in a VCF file and updates the INFO column of the variants table with the calculated
 9784        statistics.
 9785
 9786        :param info: The `info` parameter is a string that represents the type of information for which
 9787        genotype statistics are calculated. It is used to generate various VCF info tags for the
 9788        statistics, such as the number of occurrences, the list of values, the minimum value, the
 9789        maximum value, the mean, the median, defaults to VAF
 9790        :type info: str (optional)
 9791        """
 9792
 9793        # if FORMAT and samples
 9794        if (
 9795            "FORMAT" in self.get_header_columns_as_list()
 9796            and self.get_header_sample_list()
 9797        ):
 9798
 9799            # vaf_stats annotation field
 9800            vaf_stats_tag = info + "_stats"
 9801
 9802            # VCF infos tags
 9803            vcf_infos_tags = {
 9804                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
 9805                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
 9806                info + "_stats_min": f"genotype {info} Statistics - min {info}",
 9807                info + "_stats_max": f"genotype {info} Statistics - max {info}",
 9808                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
 9809                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
 9810                info
 9811                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
 9812            }
 9813
 9814            # Prefix
 9815            prefix = self.get_explode_infos_prefix()
 9816
 9817            # Field
 9818            vaf_stats_infos = prefix + vaf_stats_tag
 9819
 9820            # Variants table
 9821            table_variants = self.get_table_variants()
 9822
 9823            # Header
 9824            vcf_reader = self.get_header()
 9825
 9826            # Create variant id
 9827            variant_id_column = self.get_variant_id_column()
 9828            added_columns = [variant_id_column]
 9829
 9830            # variant_id, FORMAT and samples
 9831            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9832                self.get_header_sample_list()
 9833            )
 9834
 9835            # Create dataframe
 9836            dataframe_vaf_stats = self.get_query_to_df(
 9837                f""" SELECT {samples_fields} FROM {table_variants} """
 9838            )
 9839
 9840            # Create vaf_stats column
 9841            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
 9842                lambda row: genotype_stats(
 9843                    row, samples=self.get_header_sample_list(), info=info
 9844                ),
 9845                axis=1,
 9846            )
 9847
 9848            # List of vcf tags
 9849            sql_vaf_stats_fields = []
 9850
 9851            # Check all VAF stats infos
 9852            for stat in vcf_infos_tags:
 9853
 9854                # Extract stats
 9855                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
 9856                    lambda x: dict(x).get(stat, "")
 9857                )
 9858
 9859                # Add snpeff_hgvs to header
 9860                vcf_reader.infos[stat] = vcf.parser._Info(
 9861                    stat,
 9862                    ".",
 9863                    "String",
 9864                    vcf_infos_tags.get(stat, "genotype statistics"),
 9865                    "howard calculation",
 9866                    "0",
 9867                    self.code_type_map.get("String"),
 9868                )
 9869
 9870                if len(sql_vaf_stats_fields):
 9871                    sep = ";"
 9872                else:
 9873                    sep = ""
 9874
 9875                # Create fields to add in INFO
 9876                sql_vaf_stats_fields.append(
 9877                    f"""
 9878                        CASE
 9879                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
 9880                            THEN concat(
 9881                                    '{sep}{stat}=',
 9882                                    dataframe_vaf_stats."{stat}"
 9883                                )
 9884                            ELSE ''
 9885                        END
 9886                    """
 9887                )
 9888
 9889            # SQL set for update
 9890            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
 9891
 9892            # Update
 9893            sql_update = f"""
 9894                UPDATE {table_variants}
 9895                SET "INFO" = 
 9896                    concat(
 9897                        CASE
 9898                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9899                            THEN ''
 9900                            ELSE concat("INFO", ';')
 9901                        END,
 9902                        {sql_vaf_stats_fields_set}
 9903                    )
 9904                FROM dataframe_vaf_stats
 9905                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
 9906
 9907            """
 9908            self.conn.execute(sql_update)
 9909
 9910            # Remove added columns
 9911            for added_column in added_columns:
 9912                self.drop_column(column=added_column)
 9913
 9914            # Delete dataframe
 9915            del dataframe_vaf_stats
 9916            gc.collect()
 9917
 9918    def calculation_transcripts_annotation(
 9919        self, info_json: str = None, info_format: str = None
 9920    ) -> None:
 9921        """
 9922        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9923        field to it if transcripts are available.
 9924
 9925        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9926        is a string parameter that represents the information field to be used in the transcripts JSON.
 9927        It is used to specify the JSON format for the transcripts information. If no value is provided
 9928        when calling the method, it defaults to "
 9929        :type info_json: str
 9930        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9931        method is a string parameter that specifies the format of the information field to be used in
 9932        the transcripts JSON. It is used to define the format of the information field
 9933        :type info_format: str
 9934        """
 9935
 9936        # Create transcripts table
 9937        transcripts_table = self.create_transcript_view()
 9938
 9939        # Add info field
 9940        if transcripts_table:
 9941            self.transcript_view_to_variants(
 9942                transcripts_table=transcripts_table,
 9943                transcripts_info_field_json=info_json,
 9944                transcripts_info_field_format=info_format,
 9945            )
 9946        else:
 9947            log.info("No Transcripts to process. Check param.json file configuration")
 9948
 9949    def calculation_transcripts_prioritization(self) -> None:
 9950        """
 9951        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9952        prioritizes transcripts based on certain criteria.
 9953        """
 9954
 9955        # Create transcripts table
 9956        transcripts_table = self.create_transcript_view()
 9957
 9958        # Add info field
 9959        if transcripts_table:
 9960            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9961        else:
 9962            log.info("No Transcripts to process. Check param.json file configuration")
 9963
 9964    def calculation_transcripts_export(self) -> None:
 9965        """ """
 9966
 9967        # Create transcripts table
 9968        transcripts_table = self.create_transcript_view()
 9969
 9970        # Add info field
 9971        if transcripts_table:
 9972            self.transcripts_export(transcripts_table=transcripts_table)
 9973        else:
 9974            log.info("No Transcripts to process. Check param.json file configuration")
 9975
 9976    ###############
 9977    # Transcripts #
 9978    ###############
 9979
 9980    def transcripts_export(
 9981        self, transcripts_table: str = None, param: dict = {}
 9982    ) -> bool:
 9983        """ """
 9984
 9985        log.debug("Start transcripts export...")
 9986
 9987        # Param
 9988        if not param:
 9989            param = self.get_param()
 9990
 9991        # Param export
 9992        param_transcript_export = param.get("transcripts", {}).get("export", {})
 9993
 9994        # Output file
 9995        transcripts_export_output = param_transcript_export.get("output", None)
 9996
 9997        if not param_transcript_export or not transcripts_export_output:
 9998            log.warning(f"No transcriipts export parameters defined!")
 9999            return False
10000
10001        # List of transcripts annotations
10002        query_describe = f"""
10003            SELECT column_name
10004            FROM (
10005                    DESCRIBE SELECT * FROM {transcripts_table}
10006                )
10007            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10008        """
10009        transcripts_annotations_list = list(
10010            self.get_query_to_df(query=query_describe)["column_name"]
10011        )
10012
10013        # Create transcripts table for export
10014        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10015            random.choices(string.ascii_uppercase + string.digits, k=10)
10016        )
10017        query_create_transcripts_table_export = f"""
10018            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10019        """
10020        self.execute_query(query=query_create_transcripts_table_export)
10021
10022        # Output file format
10023        transcripts_export_output_format = get_file_format(
10024            filename=transcripts_export_output
10025        )
10026
10027        # Format VCF - construct INFO
10028        if transcripts_export_output_format in ["vcf"]:
10029
10030            # Construct query update INFO and header
10031            query_update_info = []
10032            for field in transcripts_annotations_list:
10033
10034                # If field not in header
10035                if field not in self.get_header_infos_list():
10036
10037                    # Add PZ Transcript in header
10038                    self.get_header().infos[field] = vcf.parser._Info(
10039                        field,
10040                        ".",
10041                        "String",
10042                        f"Annotation '{field}' from transcript view",
10043                        "unknown",
10044                        "unknown",
10045                        0,
10046                    )
10047
10048                # Add field as INFO/tag
10049                query_update_info.append(
10050                    f"""
10051                        CASE
10052                            WHEN "{field}" IS NOT NULL
10053                            THEN concat('{field}=', "{field}", ';')    
10054                            ELSE ''     
10055                        END
10056                        """
10057                )
10058
10059            # Query param
10060            query_update_info_value = (
10061                f""" concat('',  {", ".join(query_update_info)}) """
10062            )
10063            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10064
10065        else:
10066
10067            # Query param
10068            query_update_info_value = f""" NULL """
10069            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10070
10071        # Update query INFO column
10072        query_update = f"""
10073            UPDATE {transcripts_table_export}
10074            SET INFO = {query_update_info_value}
10075
10076        """
10077        self.execute_query(query=query_update)
10078
10079        # Export
10080        self.export_output(
10081            output_file=transcripts_export_output,
10082            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10083        )
10084
10085        # Drop transcripts export table
10086        query_drop_transcripts_table_export = f"""
10087            DROP TABLE {transcripts_table_export}
10088        """
10089        self.execute_query(query=query_drop_transcripts_table_export)
10090
10091    def transcripts_prioritization(
10092        self, transcripts_table: str = None, param: dict = {}
10093    ) -> bool:
10094        """
10095        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10096        and updates the variants table with the prioritized information.
10097
10098        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10099        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10100        This parameter is used to identify the table where the transcripts data is stored for the
10101        prioritization process
10102        :type transcripts_table: str
10103        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10104        that contains various configuration settings for the prioritization process of transcripts. It
10105        is used to customize the behavior of the prioritization algorithm and includes settings such as
10106        the prefix for prioritization fields, default profiles, and other
10107        :type param: dict
10108        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10109        transcripts prioritization process is successfully completed, and `False` if there are any
10110        issues or if no profile is defined for transcripts prioritization.
10111        """
10112
10113        log.debug("Start transcripts prioritization...")
10114
10115        # Param
10116        if not param:
10117            param = self.get_param()
10118
10119        # Variants table
10120        table_variants = self.get_table_variants()
10121
10122        # Transcripts table
10123        if transcripts_table is None:
10124            transcripts_table = self.create_transcript_view(
10125                transcripts_table="transcripts", param=param
10126            )
10127        if transcripts_table is None:
10128            msg_err = "No Transcripts table availalble"
10129            log.error(msg_err)
10130            raise ValueError(msg_err)
10131        log.debug(f"transcripts_table={transcripts_table}")
10132
10133        # Get transcripts columns
10134        columns_as_list_query = f"""
10135            DESCRIBE {transcripts_table}
10136        """
10137        columns_as_list = list(
10138            self.get_query_to_df(columns_as_list_query)["column_name"]
10139        )
10140
10141        # Create INFO if not exists
10142        if "INFO" not in columns_as_list:
10143            query_add_info = f"""
10144                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10145            """
10146            self.execute_query(query_add_info)
10147
10148        # Prioritization param and Force only PZ Score and Flag
10149        pz_param = param.get("transcripts", {}).get("prioritization", {})
10150
10151        # PZ profile by default
10152        pz_profile_default = (
10153            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10154        )
10155
10156        # Exit if no profile
10157        if pz_profile_default is None:
10158            log.warning("No profile defined for transcripts prioritization")
10159            return False
10160
10161        # PZ fields
10162        pz_param_pzfields = {}
10163
10164        # PZ field transcripts
10165        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10166
10167        # Add PZ Transcript in header
10168        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10169            pz_fields_transcripts,
10170            ".",
10171            "String",
10172            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10173            "unknown",
10174            "unknown",
10175            code_type_map["String"],
10176        )
10177
10178        # Mandatory fields
10179        pz_mandatory_fields_list = [
10180            "Score",
10181            "Flag",
10182            "Tags",
10183            "Comment",
10184            "Infos",
10185            "Class",
10186        ]
10187        pz_mandatory_fields = []
10188        for pz_mandatory_field in pz_mandatory_fields_list:
10189            pz_mandatory_fields.append(
10190                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10191            )
10192
10193        # PZ fields in param
10194        for pz_field in pz_param.get("pzfields", []):
10195            if pz_field in pz_mandatory_fields_list:
10196                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10197                    pz_param.get("pzprefix", "PTZ") + pz_field
10198                )
10199            else:
10200                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10201                pz_param_pzfields[pz_field] = pz_field_new
10202
10203                # Add PZ Transcript in header
10204                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10205                    pz_field_new,
10206                    ".",
10207                    "String",
10208                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10209                    "unknown",
10210                    "unknown",
10211                    code_type_map["String"],
10212                )
10213
10214        # PZ fields param
10215        pz_param["pzfields"] = pz_mandatory_fields
10216
10217        # Prioritization
10218        prioritization_result = self.prioritization(
10219            table=transcripts_table,
10220            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10221        )
10222        if not prioritization_result:
10223            log.warning("Transcripts prioritization not processed")
10224            return False
10225
10226        # PZ fields sql query
10227        query_update_select_list = []
10228        query_update_concat_list = []
10229        query_update_order_list = []
10230        for pz_param_pzfield in set(
10231            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10232        ):
10233            query_update_select_list.append(f" {pz_param_pzfield}, ")
10234
10235        for pz_param_pzfield in pz_param_pzfields:
10236            query_update_concat_list.append(
10237                f"""
10238                    , CASE 
10239                        WHEN {pz_param_pzfield} IS NOT NULL
10240                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10241                        ELSE ''
10242                    END
10243                """
10244            )
10245
10246        # Order by
10247        pz_orders = (
10248            param.get("transcripts", {})
10249            .get("prioritization", {})
10250            .get("prioritization_transcripts_order", {})
10251        )
10252        if not pz_orders:
10253            pz_orders = {
10254                pz_param.get("pzprefix", "PTZ") + "Flag": "ASC",
10255                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10256            }
10257        for pz_order in pz_orders:
10258            query_update_order_list.append(
10259                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10260            )
10261
10262        # Fields to explode
10263        fields_to_explode = (
10264            list(pz_param_pzfields.keys())
10265            + pz_mandatory_fields
10266            + list(pz_orders.keys())
10267        )
10268        # Remove transcript column as a specific transcript column
10269        if "transcript" in fields_to_explode:
10270            fields_to_explode.remove("transcript")
10271
10272        # Fields intranscripts table
10273        query_transcripts_table = f"""
10274            DESCRIBE SELECT * FROM {transcripts_table}
10275        """
10276        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10277
10278        # Check fields to explode
10279        for field_to_explode in fields_to_explode:
10280            if field_to_explode not in self.get_header_infos_list() + list(
10281                query_transcripts_table.column_name
10282            ):
10283                msg_err = f"INFO/{field_to_explode} NOT IN header"
10284                log.error(msg_err)
10285                raise ValueError(msg_err)
10286
10287        # Explode fields to explode
10288        self.explode_infos(
10289            table=transcripts_table,
10290            fields=fields_to_explode,
10291        )
10292
10293        # Transcript preference file
10294        transcripts_preference_file = (
10295            param.get("transcripts", {})
10296            .get("prioritization", {})
10297            .get("prioritization_transcripts", {})
10298        )
10299        transcripts_preference_file = full_path(transcripts_preference_file)
10300
10301        # Transcript preference forced
10302        transcript_preference_force = (
10303            param.get("transcripts", {})
10304            .get("prioritization", {})
10305            .get("prioritization_transcripts_force", False)
10306        )
10307        # Transcript version forced
10308        transcript_version_force = (
10309            param.get("transcripts", {})
10310            .get("prioritization", {})
10311            .get("prioritization_transcripts_version_force", False)
10312        )
10313
10314        # Transcripts Ranking
10315        if transcripts_preference_file:
10316
10317            # Transcripts file to dataframe
10318            if os.path.exists(transcripts_preference_file):
10319                transcripts_preference_dataframe = transcripts_file_to_df(
10320                    transcripts_preference_file
10321                )
10322            else:
10323                log.error(
10324                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10325                )
10326                raise ValueError(
10327                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10328                )
10329
10330            # Order by depending to transcript preference forcing
10331            if transcript_preference_force:
10332                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10333            else:
10334                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10335
10336            # Transcript columns joined depend on version consideration
10337            if transcript_version_force:
10338                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10339            else:
10340                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10341
10342            # Query ranking for update
10343            query_update_ranking = f"""
10344                SELECT
10345                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10346                    ROW_NUMBER() OVER (
10347                        PARTITION BY "#CHROM", POS, REF, ALT
10348                        ORDER BY {order_by}
10349                    ) AS rn
10350                FROM {transcripts_table}
10351                LEFT JOIN 
10352                    (
10353                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10354                        FROM transcripts_preference_dataframe
10355                    ) AS transcripts_preference
10356                ON {transcripts_version_join}
10357            """
10358
10359        else:
10360
10361            # Query ranking for update
10362            query_update_ranking = f"""
10363                SELECT
10364                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10365                    ROW_NUMBER() OVER (
10366                        PARTITION BY "#CHROM", POS, REF, ALT
10367                        ORDER BY {" , ".join(query_update_order_list)}
10368                    ) AS rn
10369                FROM {transcripts_table}
10370            """
10371
10372        # Export Transcripts prioritization infos to variants table
10373        query_update = f"""
10374            WITH RankedTranscripts AS (
10375                {query_update_ranking}
10376            )
10377            UPDATE {table_variants}
10378                SET
10379                INFO = CONCAT(CASE
10380                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10381                            THEN ''
10382                            ELSE concat("INFO", ';')
10383                        END,
10384                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10385                        )
10386            FROM
10387                RankedTranscripts
10388            WHERE
10389                rn = 1
10390                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10391                AND variants."POS" = RankedTranscripts."POS"
10392                AND variants."REF" = RankedTranscripts."REF"
10393                AND variants."ALT" = RankedTranscripts."ALT"     
10394        """
10395
10396        # log.debug(f"query_update={query_update}")
10397        self.execute_query(query=query_update)
10398
10399        # Return
10400        return True
10401
10402    def create_transcript_view_from_columns_map(
10403        self,
10404        transcripts_table: str = "transcripts",
10405        columns_maps: dict = {},
10406        added_columns: list = [],
10407        temporary_tables: list = None,
10408        annotation_fields: list = None,
10409        column_rename: dict = {},
10410        column_clean: bool = False,
10411        column_case: str = None,
10412    ) -> tuple[list, list, list]:
10413        """
10414        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10415        specified columns mapping for transcripts data.
10416
10417        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10418        of the table where the transcripts data is stored or will be stored in the database. This table
10419        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10420        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10421        :type transcripts_table: str (optional)
10422        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10423        about how to map columns from a transcripts table to create a view. Each entry in the
10424        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10425        typically includes details such as the main transcript column and additional information columns
10426        :type columns_maps: dict
10427        :param added_columns: The `added_columns` parameter in the
10428        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10429        that will be added to the view being created based on the columns map provided. These columns
10430        are generated by exploding the transcript information columns along with the main transcript
10431        column
10432        :type added_columns: list
10433        :param temporary_tables: The `temporary_tables` parameter in the
10434        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10435        tables created during the process of creating a transcript view from a columns map. These
10436        temporary tables are used to store intermediate results or transformations before the final view
10437        is generated
10438        :type temporary_tables: list
10439        :param annotation_fields: The `annotation_fields` parameter in the
10440        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10441        used for annotation in the query view creation process. These fields are extracted from the
10442        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10443        :type annotation_fields: list
10444        :param column_rename: The `column_rename` parameter in the
10445        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10446        custom renaming for columns during the creation of the temporary table view. This parameter
10447        provides a mapping of original column names to the desired renamed column names. By using this
10448        parameter,
10449        :type column_rename: dict
10450        :param column_clean: The `column_clean` parameter in the
10451        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10452        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10453        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10454        False
10455        :type column_clean: bool (optional)
10456        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10457        function is used to specify the case transformation to be applied to the columns during the view
10458        creation process. It allows you to control whether the column values should be converted to
10459        lowercase, uppercase, or remain unchanged
10460        :type column_case: str
10461        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10462        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10463        """
10464
10465        log.debug("Start transcrpts view creation from columns map...")
10466
10467        # "from_columns_map": [
10468        #     {
10469        #         "transcripts_column": "Ensembl_transcriptid",
10470        #         "transcripts_infos_columns": [
10471        #             "genename",
10472        #             "Ensembl_geneid",
10473        #             "LIST_S2_score",
10474        #             "LIST_S2_pred",
10475        #         ],
10476        #     },
10477        #     {
10478        #         "transcripts_column": "Ensembl_transcriptid",
10479        #         "transcripts_infos_columns": [
10480        #             "genename",
10481        #             "VARITY_R_score",
10482        #             "Aloft_pred",
10483        #         ],
10484        #     },
10485        # ],
10486
10487        # Init
10488        if temporary_tables is None:
10489            temporary_tables = []
10490        if annotation_fields is None:
10491            annotation_fields = []
10492
10493        # Variants table
10494        table_variants = self.get_table_variants()
10495
10496        for columns_map in columns_maps:
10497
10498            # Transcript column
10499            transcripts_column = columns_map.get("transcripts_column", None)
10500
10501            # Transcripts infos columns
10502            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10503
10504            # Transcripts infos columns rename
10505            column_rename = columns_map.get("column_rename", column_rename)
10506
10507            # Transcripts infos columns clean
10508            column_clean = columns_map.get("column_clean", column_clean)
10509
10510            # Transcripts infos columns case
10511            column_case = columns_map.get("column_case", column_case)
10512
10513            if transcripts_column is not None:
10514
10515                # Explode
10516                added_columns += self.explode_infos(
10517                    fields=[transcripts_column] + transcripts_infos_columns
10518                )
10519
10520                # View clauses
10521                clause_select_variants = []
10522                clause_select_tanscripts = []
10523                for field in [transcripts_column] + transcripts_infos_columns:
10524
10525                    # AS field
10526                    as_field = field
10527
10528                    # Rename
10529                    if column_rename:
10530                        as_field = column_rename.get(as_field, as_field)
10531
10532                    # Clean
10533                    if column_clean:
10534                        as_field = clean_annotation_field(as_field)
10535
10536                    # Case
10537                    if column_case:
10538                        if column_case.lower() in ["lower"]:
10539                            as_field = as_field.lower()
10540                        elif column_case.lower() in ["upper"]:
10541                            as_field = as_field.upper()
10542
10543                    # Clause select Variants
10544                    clause_select_variants.append(
10545                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10546                    )
10547
10548                    if field in [transcripts_column]:
10549                        clause_select_tanscripts.append(
10550                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10551                        )
10552                    else:
10553                        clause_select_tanscripts.append(
10554                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10555                        )
10556                        annotation_fields.append(as_field)
10557
10558                # Querey View
10559                query = f""" 
10560                    SELECT
10561                        "#CHROM", POS, REF, ALT, INFO,
10562                        "{transcripts_column}" AS 'transcript',
10563                        {", ".join(clause_select_tanscripts)}
10564                    FROM (
10565                        SELECT 
10566                            "#CHROM", POS, REF, ALT, INFO,
10567                            {", ".join(clause_select_variants)}
10568                        FROM {table_variants}
10569                        )
10570                    WHERE "{transcripts_column}" IS NOT NULL
10571                """
10572
10573                # Create temporary table
10574                temporary_table = transcripts_table + "".join(
10575                    random.choices(string.ascii_uppercase + string.digits, k=10)
10576                )
10577
10578                # Temporary_tables
10579                temporary_tables.append(temporary_table)
10580                query_view = f"""
10581                    CREATE TEMPORARY TABLE {temporary_table}
10582                    AS ({query})
10583                """
10584                self.execute_query(query=query_view)
10585
10586        return added_columns, temporary_tables, annotation_fields
10587
10588    def create_transcript_view_from_column_format(
10589        self,
10590        transcripts_table: str = "transcripts",
10591        column_formats: dict = {},
10592        temporary_tables: list = None,
10593        annotation_fields: list = None,
10594        column_rename: dict = {},
10595        column_clean: bool = False,
10596        column_case: str = None,
10597    ) -> tuple[list, list, list]:
10598        """
10599        The `create_transcript_view_from_column_format` function generates a transcript view based on
10600        specified column formats, adds additional columns and annotation fields, and returns the list of
10601        temporary tables and annotation fields.
10602
10603        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10604        of the table containing the transcripts data. This table will be used as the base table for
10605        creating the transcript view. The default value for this parameter is "transcripts", but you can
10606        provide a different table name if needed, defaults to transcripts
10607        :type transcripts_table: str (optional)
10608        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10609        about the columns to be used for creating the transcript view. Each entry in the dictionary
10610        specifies the mapping between a transcripts column and a transcripts infos column. This
10611        parameter allows you to define how the columns from the transcripts table should be transformed
10612        or mapped
10613        :type column_formats: dict
10614        :param temporary_tables: The `temporary_tables` parameter in the
10615        `create_transcript_view_from_column_format` function is a list that stores the names of
10616        temporary views created during the process of creating a transcript view from a column format.
10617        These temporary views are used to manipulate and extract data before generating the final
10618        transcript view
10619        :type temporary_tables: list
10620        :param annotation_fields: The `annotation_fields` parameter in the
10621        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10622        that are extracted from the temporary views created during the process. These annotation fields
10623        are obtained by querying the temporary views and extracting the column names excluding specific
10624        columns like `#CH
10625        :type annotation_fields: list
10626        :param column_rename: The `column_rename` parameter in the
10627        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10628        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10629        column names to new column names in this dictionary, you can rename specific columns during the
10630        process
10631        :type column_rename: dict
10632        :param column_clean: The `column_clean` parameter in the
10633        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10634        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10635        will be cleaned during the creation of the transcript view based on the specified column format,
10636        defaults to False
10637        :type column_clean: bool (optional)
10638        :param column_case: The `column_case` parameter in the
10639        `create_transcript_view_from_column_format` function is used to specify the case transformation
10640        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10641        to convert the column names to uppercase or lowercase, respectively
10642        :type column_case: str
10643        :return: The `create_transcript_view_from_column_format` function returns two lists:
10644        `temporary_tables` and `annotation_fields`.
10645        """
10646
10647        log.debug("Start transcrpts view creation from column format...")
10648
10649        #  "from_column_format": [
10650        #     {
10651        #         "transcripts_column": "ANN",
10652        #         "transcripts_infos_column": "Feature_ID",
10653        #     }
10654        # ],
10655
10656        # Init
10657        if temporary_tables is None:
10658            temporary_tables = []
10659        if annotation_fields is None:
10660            annotation_fields = []
10661
10662        for column_format in column_formats:
10663
10664            # annotation field and transcript annotation field
10665            annotation_field = column_format.get("transcripts_column", "ANN")
10666            transcript_annotation = column_format.get(
10667                "transcripts_infos_column", "Feature_ID"
10668            )
10669
10670            # Transcripts infos columns rename
10671            column_rename = column_format.get("column_rename", column_rename)
10672
10673            # Transcripts infos columns clean
10674            column_clean = column_format.get("column_clean", column_clean)
10675
10676            # Transcripts infos columns case
10677            column_case = column_format.get("column_case", column_case)
10678
10679            # Temporary View name
10680            temporary_view_name = transcripts_table + "".join(
10681                random.choices(string.ascii_uppercase + string.digits, k=10)
10682            )
10683
10684            # Create temporary view name
10685            temporary_view_name = self.annotation_format_to_table(
10686                uniquify=True,
10687                annotation_field=annotation_field,
10688                view_name=temporary_view_name,
10689                annotation_id=transcript_annotation,
10690                column_rename=column_rename,
10691                column_clean=column_clean,
10692                column_case=column_case,
10693            )
10694
10695            # Annotation fields
10696            if temporary_view_name:
10697                query_annotation_fields = f"""
10698                    SELECT *
10699                    FROM (
10700                        DESCRIBE SELECT *
10701                        FROM {temporary_view_name}
10702                        )
10703                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10704                """
10705                df_annotation_fields = self.get_query_to_df(
10706                    query=query_annotation_fields
10707                )
10708
10709                # Add temporary view and annotation fields
10710                temporary_tables.append(temporary_view_name)
10711                annotation_fields += list(set(df_annotation_fields["column_name"]))
10712
10713        return temporary_tables, annotation_fields
10714
10715    def create_transcript_view(
10716        self,
10717        transcripts_table: str = None,
10718        transcripts_table_drop: bool = True,
10719        param: dict = {},
10720    ) -> str:
10721        """
10722        The `create_transcript_view` function generates a transcript view by processing data from a
10723        specified table based on provided parameters and structural information.
10724
10725        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10726        is used to specify the name of the table that will store the final transcript view data. If a table
10727        name is not provided, the function will create a new table to store the transcript view data, and by
10728        default,, defaults to transcripts
10729        :type transcripts_table: str (optional)
10730        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10731        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10732        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10733        the function will drop the existing transcripts table if it exists, defaults to True
10734        :type transcripts_table_drop: bool (optional)
10735        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10736        contains information needed to create a transcript view. It includes details such as the structure
10737        of the transcripts, columns mapping, column formats, and other necessary information for generating
10738        the view. This parameter allows for flexibility and customization
10739        :type param: dict
10740        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10741        created or modified during the execution of the function.
10742        """
10743
10744        log.debug("Start transcripts view creation...")
10745
10746        # Default
10747        transcripts_table_default = "transcripts"
10748
10749        # Param
10750        if not param:
10751            param = self.get_param()
10752
10753        # Struct
10754        struct = param.get("transcripts", {}).get("struct", None)
10755
10756        # Transcript veresion
10757        transcript_id_remove_version = param.get("transcripts", {}).get(
10758            "transcript_id_remove_version", False
10759        )
10760
10761        # Transcripts mapping
10762        transcript_id_mapping_file = param.get("transcripts", {}).get(
10763            "transcript_id_mapping_file", None
10764        )
10765
10766        # Transcripts mapping
10767        transcript_id_mapping_force = param.get("transcripts", {}).get(
10768            "transcript_id_mapping_force", None
10769        )
10770
10771        if struct:
10772
10773            # Transcripts table
10774            if transcripts_table is None:
10775                transcripts_table = param.get("transcripts", {}).get(
10776                    "table", transcripts_table_default
10777                )
10778
10779            # added_columns
10780            added_columns = []
10781
10782            # Temporary tables
10783            temporary_tables = []
10784
10785            # Annotation fields
10786            annotation_fields = []
10787
10788            # from columns map
10789            columns_maps = struct.get("from_columns_map", [])
10790            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10791                self.create_transcript_view_from_columns_map(
10792                    transcripts_table=transcripts_table,
10793                    columns_maps=columns_maps,
10794                    added_columns=added_columns,
10795                    temporary_tables=temporary_tables,
10796                    annotation_fields=annotation_fields,
10797                )
10798            )
10799            added_columns += added_columns_tmp
10800            temporary_tables += temporary_tables_tmp
10801            annotation_fields += annotation_fields_tmp
10802
10803            # from column format
10804            column_formats = struct.get("from_column_format", [])
10805            temporary_tables_tmp, annotation_fields_tmp = (
10806                self.create_transcript_view_from_column_format(
10807                    transcripts_table=transcripts_table,
10808                    column_formats=column_formats,
10809                    temporary_tables=temporary_tables,
10810                    annotation_fields=annotation_fields,
10811                )
10812            )
10813            temporary_tables += temporary_tables_tmp
10814            annotation_fields += annotation_fields_tmp
10815
10816            # Remove some specific fields/column
10817            annotation_fields = list(set(annotation_fields))
10818            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10819                if field in annotation_fields:
10820                    annotation_fields.remove(field)
10821
10822            # Merge temporary tables query
10823            query_merge = ""
10824            for temporary_table in list(set(temporary_tables)):
10825
10826                # First temporary table
10827                if not query_merge:
10828                    query_merge = f"""
10829                        SELECT * FROM {temporary_table}
10830                    """
10831                # other temporary table (using UNION)
10832                else:
10833                    query_merge += f"""
10834                        UNION BY NAME SELECT * FROM {temporary_table}
10835                    """
10836
10837            # transcript table tmp
10838            transcript_table_tmp = "transcripts_tmp"
10839            transcript_table_tmp2 = "transcripts_tmp2"
10840            transcript_table_tmp3 = "transcripts_tmp3"
10841
10842            # Merge on transcript
10843            query_merge_on_transcripts_annotation_fields = []
10844
10845            # Add transcript list
10846            query_merge_on_transcripts_annotation_fields.append(
10847                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10848            )
10849
10850            # Aggregate all annotations fields
10851            for annotation_field in set(annotation_fields):
10852                query_merge_on_transcripts_annotation_fields.append(
10853                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10854                )
10855
10856            # Transcripts mapping
10857            if transcript_id_mapping_file:
10858
10859                # Transcript dataframe
10860                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10861                transcript_id_mapping_dataframe = transcripts_file_to_df(
10862                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10863                )
10864
10865                # Transcript version remove
10866                if transcript_id_remove_version:
10867                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10868                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10869                    query_left_join = f"""
10870                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10871                    """
10872                else:
10873                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10874                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10875                    query_left_join = f"""
10876                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10877                    """
10878
10879                # Transcript column for group by merge
10880                query_transcript_merge_group_by = """
10881                        CASE
10882                            WHEN transcript_mapped NOT IN ('')
10883                            THEN split_part(transcript_mapped, '.', 1)
10884                            ELSE split_part(transcript_original, '.', 1)
10885                        END
10886                    """
10887
10888                # Merge query
10889                transcripts_tmp2_query = f"""
10890                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10891                    FROM ({query_merge}) AS {transcript_table_tmp}
10892                    {query_left_join}
10893                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10894                """
10895
10896                # Retrive columns after mege
10897                transcripts_tmp2_describe_query = f"""
10898                    DESCRIBE {transcripts_tmp2_query}
10899                """
10900                transcripts_tmp2_describe_list = list(
10901                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10902                        "column_name"
10903                    ]
10904                )
10905
10906                # Create list of columns for select clause
10907                transcripts_tmp2_describe_select_clause = []
10908                for field in transcripts_tmp2_describe_list:
10909                    if field not in [
10910                        "#CHROM",
10911                        "POS",
10912                        "REF",
10913                        "ALT",
10914                        "INFO",
10915                        "transcript_mapped",
10916                    ]:
10917                        as_field = field
10918                        if field in ["transcript_original"]:
10919                            as_field = "transcripts_mapped"
10920                        transcripts_tmp2_describe_select_clause.append(
10921                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
10922                        )
10923
10924                # Merge with mapping
10925                query_merge_on_transcripts = f"""
10926                    SELECT
10927                        "#CHROM", POS, REF, ALT, INFO,
10928                        CASE
10929                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
10930                            THEN ANY_VALUE(transcript_mapped)
10931                            ELSE ANY_VALUE(transcript_original)
10932                        END AS transcript,
10933                        {", ".join(transcripts_tmp2_describe_select_clause)}
10934                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
10935                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
10936                        {query_transcript_merge_group_by}
10937                """
10938
10939                # Add transcript filter from mapping file
10940                if transcript_id_mapping_force:
10941                    query_merge_on_transcripts = f"""
10942                        SELECT *
10943                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
10944                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
10945                    """
10946
10947            # No transcript mapping
10948            else:
10949
10950                # Remove transcript version
10951                if transcript_id_remove_version:
10952                    query_transcript_column = f"""
10953                        split_part({transcript_table_tmp}.transcript, '.', 1)
10954                    """
10955                else:
10956                    query_transcript_column = """
10957                        transcript
10958                    """
10959
10960                # Query sections
10961                query_transcript_column_select = (
10962                    f"{query_transcript_column} AS transcript"
10963                )
10964                query_transcript_column_group_by = query_transcript_column
10965
10966                # Query for transcripts view
10967                query_merge_on_transcripts = f"""
10968                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
10969                    FROM ({query_merge}) AS {transcript_table_tmp}
10970                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
10971                """
10972
10973            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
10974
10975            # Drop transcript view is necessary
10976            if transcripts_table_drop:
10977                query_drop = f"""
10978                    DROP TABLE IF EXISTS {transcripts_table};
10979                """
10980                self.execute_query(query=query_drop)
10981
10982            # Merge and create transcript view
10983            query_create_view = f"""
10984                CREATE TABLE IF NOT EXISTS {transcripts_table}
10985                AS {query_merge_on_transcripts}
10986            """
10987            self.execute_query(query=query_create_view)
10988
10989            # Remove added columns
10990            for added_column in added_columns:
10991                self.drop_column(column=added_column)
10992
10993        else:
10994
10995            transcripts_table = None
10996
10997        return transcripts_table
10998
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a snpEff/VEP-style annotation INFO field (e.g. "ANN") into a
        temporary table with one typed column per annotation sub-field.

        The sub-field names are parsed from the INFO field description in the
        VCF header (the text quoted between single quotes, split on " | ").
        Each variant's annotation string is converted to JSON with
        `explode_annotation_format`, then unnested with DuckDB JSON functions
        into typed columns. A temporary table `view_name` is created holding
        "#CHROM", POS, REF, ALT, INFO, the annotation sub-fields, and a
        'transcript' column aliased from `annotation_id`.

        :param uniquify: passed to `explode_annotation_format` to de-duplicate
        annotation values, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: name of the INFO field holding the
        annotations, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: annotation sub-field used as the transcript
        identifier; it is aliased as 'transcript' in the resulting table,
        defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: name of the temporary table to create, defaults to
        transcripts
        :type view_name: str (optional)
        :param column_rename: optional mapping of original column names to
        renamed ones; applied to `annotation_id` and to each extracted key
        :type column_rename: dict
        :param column_clean: if True, clean column names with
        `clean_annotation_field`, defaults to False
        :type column_clean: bool (optional)
        :param column_case: optional case normalization for column names:
        "lower" or "upper" (case-insensitive); other values are ignored
        :type column_case: str
        :return: the name of the created table (`view_name`), or None if
        `annotation_field` is not declared in the VCF header
        """

        # Name of the intermediate JSON column added to the dataframe below.
        annotation_format = "annotation_explode"

        # Apply the same rename/clean transformations to the transcript
        # identifier as will be applied to the extracted keys, so the alias
        # in the final SELECT matches an actual column name.
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # NOTE(review): any truthy prefix is normalized to "INFO/". Assumes
        # get_explode_infos_prefix() returns a string (e.g. ""), otherwise
        # the concatenations below would raise TypeError — TODO confirm.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names of the exploded INFO fields (possibly "INFO/"-prefixed).
        # NOTE(review): the SQL queries below reference the unprefixed
        # {annotation_format} name, which only matches the dataframe column
        # when prefix is "" — verify behavior when a prefix is active.
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table name
        table_variants = self.get_table_variants()

        # VCF header object (provides INFO field declarations)
        vcf_reader = self.get_header()

        # Columns added to the variants table during this call; dropped again
        # before returning.
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the header
            # description: the part quoted between single quotes, split on
            # " | " (snpEff/VEP convention).
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only version of the sub-field name, mapped
                    # back to the original name.
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Add a unique variant id column (dropped on exit)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load the variants (with the exploded annotation column) into a
            # pandas DataFrame. The local variable name matters: DuckDB
            # resolves "dataframe_annotation_format" in the queries below via
            # its pandas replacement scan.
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string to a JSON document keyed by the
            # header sub-field names.
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the distinct JSON keys actually present in the data
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed extraction expression per key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Original key and its (possibly transformed) column name
                key = row.iloc[0]
                key_clean = key

                # Optional rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Optional cleaning
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Optional case normalization
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Extract this key's values to sample its column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Normalize missing values (None/"" -> NaN) and drop them so
                # type detection only sees real values.
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect the SQL column type from the sampled values
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed, NULL-safe extraction expression for the final SELECT
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Materialize the exploded annotations as a temporary table, with
            # the transcript identifier aliased as 'transcript'.
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not declared in the header: nothing to explode
            view_name = None

        # Drop the helper columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
11198
11199    def transcript_view_to_variants(
11200        self,
11201        transcripts_table: str = None,
11202        transcripts_column_id: str = None,
11203        transcripts_info_json: str = None,
11204        transcripts_info_field_json: str = None,
11205        transcripts_info_format: str = None,
11206        transcripts_info_field_format: str = None,
11207        param: dict = {},
11208    ) -> bool:
11209        """
11210        The `transcript_view_to_variants` function updates a variants table with information from
11211        transcripts in JSON format.
11212
11213        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11214        table containing the transcripts data. If this parameter is not provided, the function will
11215        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11216        :type transcripts_table: str
11217        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11218        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11219        identifier is used to match transcripts with variants in the database
11220        :type transcripts_column_id: str
11221        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11222        of the column in the variants table where the transcripts information will be stored in JSON
11223        format. This parameter allows you to define the column in the variants table that will hold the
11224        JSON-formatted information about transcripts
11225        :type transcripts_info_json: str
11226        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11227        specify the field in the VCF header that will contain information about transcripts in JSON
11228        format. This field will be added to the VCF header as an INFO field with the specified name
11229        :type transcripts_info_field_json: str
11230        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11231        format of the information about transcripts that will be stored in the variants table. This
11232        format can be used to define how the transcript information will be structured or displayed
11233        within the variants table
11234        :type transcripts_info_format: str
11235        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11236        specify the field in the VCF header that will contain information about transcripts in a
11237        specific format. This field will be added to the VCF header as an INFO field with the specified
11238        name
11239        :type transcripts_info_field_format: str
11240        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11241        that contains various configuration settings related to transcripts. It is used to provide
11242        default values for certain parameters if they are not explicitly provided when calling the
11243        method. The `param` dictionary can be passed as an argument
11244        :type param: dict
11245        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11246        if the operation is successful and `False` if certain conditions are not met.
11247        """
11248
11249        msg_info_prefix = "Start transcripts view to variants annotations"
11250
11251        log.debug(f"{msg_info_prefix}...")
11252
11253        # Default
11254        transcripts_table_default = "transcripts"
11255        transcripts_column_id_default = "transcript"
11256        transcripts_info_json_default = None
11257        transcripts_info_format_default = None
11258        transcripts_info_field_json_default = None
11259        transcripts_info_field_format_default = None
11260
11261        # Param
11262        if not param:
11263            param = self.get_param()
11264
11265        # Transcripts table
11266        if transcripts_table is None:
11267            transcripts_table = param.get("transcripts", {}).get(
11268                "table", transcripts_table_default
11269            )
11270
11271        # Transcripts column ID
11272        if transcripts_column_id is None:
11273            transcripts_column_id = param.get("transcripts", {}).get(
11274                "column_id", transcripts_column_id_default
11275            )
11276
11277        # Transcripts info json
11278        if transcripts_info_json is None:
11279            transcripts_info_json = param.get("transcripts", {}).get(
11280                "transcripts_info_json", transcripts_info_json_default
11281            )
11282
11283        # Transcripts info field JSON
11284        if transcripts_info_field_json is None:
11285            transcripts_info_field_json = param.get("transcripts", {}).get(
11286                "transcripts_info_field_json", transcripts_info_field_json_default
11287            )
11288        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11289        #     transcripts_info_json = transcripts_info_field_json
11290
11291        # Transcripts info format
11292        if transcripts_info_format is None:
11293            transcripts_info_format = param.get("transcripts", {}).get(
11294                "transcripts_info_format", transcripts_info_format_default
11295            )
11296
11297        # Transcripts info field FORMAT
11298        if transcripts_info_field_format is None:
11299            transcripts_info_field_format = param.get("transcripts", {}).get(
11300                "transcripts_info_field_format", transcripts_info_field_format_default
11301            )
11302        # if (
11303        #     transcripts_info_field_format is not None
11304        #     and transcripts_info_format is None
11305        # ):
11306        #     transcripts_info_format = transcripts_info_field_format
11307
11308        # Variants table
11309        table_variants = self.get_table_variants()
11310
11311        # Check info columns param
11312        if (
11313            transcripts_info_json is None
11314            and transcripts_info_field_json is None
11315            and transcripts_info_format is None
11316            and transcripts_info_field_format is None
11317        ):
11318            return False
11319
11320        # Transcripts infos columns
11321        query_transcripts_infos_columns = f"""
11322            SELECT *
11323            FROM (
11324                DESCRIBE SELECT * FROM {transcripts_table}
11325                )
11326            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11327        """
11328        transcripts_infos_columns = list(
11329            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11330        )
11331
11332        # View results
11333        clause_select = []
11334        clause_to_json = []
11335        clause_to_format = []
11336        for field in transcripts_infos_columns:
11337            # Do not consider INFO field for export into fields
11338            if field not in ["INFO"]:
11339                clause_select.append(
11340                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11341                )
11342                clause_to_json.append(f""" '{field}': "{field}" """)
11343                clause_to_format.append(f""" "{field}" """)
11344
11345        # Update
11346        update_set_json = []
11347        update_set_format = []
11348
11349        # VCF header
11350        vcf_reader = self.get_header()
11351
11352        # Transcripts to info column in JSON
11353        if transcripts_info_json:
11354
11355            # Create column on variants table
11356            self.add_column(
11357                table_name=table_variants,
11358                column_name=transcripts_info_json,
11359                column_type="JSON",
11360                default_value=None,
11361                drop=False,
11362            )
11363
11364            # Add header
11365            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11366                transcripts_info_json,
11367                ".",
11368                "String",
11369                "Transcripts in JSON format",
11370                "unknwon",
11371                "unknwon",
11372                self.code_type_map["String"],
11373            )
11374
11375            # Add to update
11376            update_set_json.append(
11377                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11378            )
11379
11380        # Transcripts to info field in JSON
11381        if transcripts_info_field_json:
11382
11383            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11384
11385            # Add to update
11386            update_set_json.append(
11387                f""" 
11388                    INFO = concat(
11389                            CASE
11390                                WHEN INFO NOT IN ('', '.')
11391                                THEN INFO
11392                                ELSE ''
11393                            END,
11394                            CASE
11395                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11396                                THEN concat(
11397                                    ';{transcripts_info_field_json}=',
11398                                    t.{transcripts_info_json}
11399                                )
11400                                ELSE ''
11401                            END
11402                            )
11403                """
11404            )
11405
11406            # Add header
11407            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11408                transcripts_info_field_json,
11409                ".",
11410                "String",
11411                "Transcripts in JSON format",
11412                "unknwon",
11413                "unknwon",
11414                self.code_type_map["String"],
11415            )
11416
11417        if update_set_json:
11418
11419            # Update query
11420            query_update = f"""
11421                UPDATE {table_variants}
11422                    SET {", ".join(update_set_json)}
11423                FROM
11424                (
11425                    SELECT
11426                        "#CHROM", POS, REF, ALT,
11427                            concat(
11428                            '{{',
11429                            string_agg(
11430                                '"' || "{transcripts_column_id}" || '":' ||
11431                                to_json(json_output)
11432                            ),
11433                            '}}'
11434                            )::JSON AS {transcripts_info_json}
11435                    FROM
11436                        (
11437                        SELECT
11438                            "#CHROM", POS, REF, ALT,
11439                            "{transcripts_column_id}",
11440                            to_json(
11441                                {{{",".join(clause_to_json)}}}
11442                            )::JSON AS json_output
11443                        FROM
11444                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11445                        WHERE "{transcripts_column_id}" IS NOT NULL
11446                        )
11447                    GROUP BY "#CHROM", POS, REF, ALT
11448                ) AS t
11449                WHERE {table_variants}."#CHROM" = t."#CHROM"
11450                    AND {table_variants}."POS" = t."POS"
11451                    AND {table_variants}."REF" = t."REF"
11452                    AND {table_variants}."ALT" = t."ALT"
11453            """
11454
11455            self.execute_query(query=query_update)
11456
11457        # Transcripts to info column in FORMAT
11458        if transcripts_info_format:
11459
11460            # Create column on variants table
11461            self.add_column(
11462                table_name=table_variants,
11463                column_name=transcripts_info_format,
11464                column_type="VARCHAR",
11465                default_value=None,
11466                drop=False,
11467            )
11468
11469            # Add header
11470            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11471                transcripts_info_format,
11472                ".",
11473                "String",
11474                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11475                "unknwon",
11476                "unknwon",
11477                self.code_type_map["String"],
11478            )
11479
11480            # Add to update
11481            update_set_format.append(
11482                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11483            )
11484
11485        else:
11486
11487            # Set variable for internal queries
11488            transcripts_info_format = "transcripts_info_format"
11489
11490        # Transcripts to info field in JSON
11491        if transcripts_info_field_format:
11492
11493            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11494
11495            # Add to update
11496            update_set_format.append(
11497                f""" 
11498                    INFO = concat(
11499                            CASE
11500                                WHEN INFO NOT IN ('', '.')
11501                                THEN INFO
11502                                ELSE ''
11503                            END,
11504                            CASE
11505                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11506                                THEN concat(
11507                                    ';{transcripts_info_field_format}=',
11508                                    t.{transcripts_info_format}
11509                                )
11510                                ELSE ''
11511                            END
11512                            )
11513                """
11514            )
11515
11516            # Add header
11517            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11518                transcripts_info_field_format,
11519                ".",
11520                "String",
11521                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11522                "unknwon",
11523                "unknwon",
11524                self.code_type_map["String"],
11525            )
11526
11527        if update_set_format:
11528
11529            # Update query
11530            query_update = f"""
11531                UPDATE {table_variants}
11532                    SET {", ".join(update_set_format)}
11533                FROM
11534                (
11535                    SELECT
11536                        "#CHROM", POS, REF, ALT,
11537                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11538                    FROM 
11539                        (
11540                        SELECT
11541                            "#CHROM", POS, REF, ALT,
11542                            "{transcripts_column_id}",
11543                            concat(
11544                                "{transcripts_column_id}",
11545                                '|',
11546                                {", '|', ".join(clause_to_format)}
11547                            ) AS {transcripts_info_format}
11548                        FROM
11549                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11550                        )
11551                    GROUP BY "#CHROM", POS, REF, ALT
11552                ) AS t
11553                WHERE {table_variants}."#CHROM" = t."#CHROM"
11554                    AND {table_variants}."POS" = t."POS"
11555                    AND {table_variants}."REF" = t."REF"
11556                    AND {table_variants}."ALT" = t."ALT"
11557            """
11558
11559            self.execute_query(query=query_update)
11560
11561        return True
class Variants:
   36class Variants:
   37
   38    def __init__(
   39        self,
   40        conn=None,
   41        input: str = None,
   42        output: str = None,
   43        config: dict = {},
   44        param: dict = {},
   45        load: bool = False,
   46    ) -> None:
   47        """
   48        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   49        header
   50
   51        :param conn: the connection to the database
   52        :param input: the input file
   53        :param output: the output file
   54        :param config: a dictionary containing the configuration of the model
   55        :param param: a dictionary containing the parameters of the model
   56        """
   57
   58        # Init variables
   59        self.init_variables()
   60
   61        # Input
   62        self.set_input(input)
   63
   64        # Config
   65        self.set_config(config)
   66
   67        # Param
   68        self.set_param(param)
   69
   70        # Output
   71        self.set_output(output)
   72
   73        # connexion
   74        self.set_connexion(conn)
   75
   76        # Header
   77        self.set_header()
   78
   79        # Samples
   80        self.set_samples()
   81
   82        # Load data
   83        if load:
   84            self.load_data()
   85
   86    def set_samples(self, samples: list = None) -> list:
   87        """
   88        The function `set_samples` sets the samples attribute of an object to a provided list or
   89        retrieves it from a parameter dictionary.
   90
   91        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   92        input and sets the `samples` attribute of the class to the provided list. If no samples are
   93        provided, it tries to get the samples from the class's parameters using the `get_param` method
   94        :type samples: list
   95        :return: The `samples` list is being returned.
   96        """
   97
   98        if not samples:
   99            samples = self.get_param().get("samples", {}).get("list", None)
  100
  101        self.samples = samples
  102
  103        return samples
  104
  105    def get_samples(self) -> list:
  106        """
  107        This function returns a list of samples.
  108        :return: The `get_samples` method is returning the `samples` attribute of the object.
  109        """
  110
  111        return self.samples
  112
  113    def get_samples_check(self) -> bool:
  114        """
  115        This function returns the value of the "check" key within the "samples" dictionary retrieved
  116        from the parameters.
  117        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  118        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  119        method. If the key "check" is not found, it will return `False`.
  120        """
  121
  122        return self.get_param().get("samples", {}).get("check", True)
  123
  124    def set_input(self, input: str = None) -> None:
  125        """
  126        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  127        attributes in the class accordingly.
  128
  129        :param input: The `set_input` method in the provided code snippet is used to set attributes
  130        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  131        :type input: str
  132        """
  133
  134        if input and not isinstance(input, str):
  135            try:
  136                self.input = input.name
  137            except:
  138                log.error(f"Input file '{input} in bad format")
  139                raise ValueError(f"Input file '{input} in bad format")
  140        else:
  141            self.input = input
  142
  143        # Input format
  144        if input:
  145            input_name, input_extension = os.path.splitext(self.input)
  146            self.input_name = input_name
  147            self.input_extension = input_extension
  148            self.input_format = self.input_extension.replace(".", "")
  149
  150    def set_config(self, config: dict) -> None:
  151        """
  152        The set_config function takes a config object and assigns it as the configuration object for the
  153        class.
  154
  155        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  156        contains configuration settings for the class. When you call the `set_config` function with a
  157        dictionary object as the argument, it will set that dictionary as the configuration object for
  158        the class
  159        :type config: dict
  160        """
  161
  162        self.config = config
  163
  164    def set_param(self, param: dict) -> None:
  165        """
  166        This function sets a parameter object for the class based on the input dictionary.
  167
  168        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  169        as the `param` attribute of the class instance
  170        :type param: dict
  171        """
  172
  173        self.param = param
  174
  175    def init_variables(self) -> None:
  176        """
  177        This function initializes the variables that will be used in the rest of the class
  178        """
  179
  180        self.prefix = "howard"
  181        self.table_variants = "variants"
  182        self.dataframe = None
  183
  184        self.comparison_map = {
  185            "gt": ">",
  186            "gte": ">=",
  187            "lt": "<",
  188            "lte": "<=",
  189            "equals": "=",
  190            "contains": "SIMILAR TO",
  191        }
  192
  193        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  194
  195        self.code_type_map_to_sql = {
  196            "Integer": "INTEGER",
  197            "String": "VARCHAR",
  198            "Float": "FLOAT",
  199            "Flag": "VARCHAR",
  200        }
  201
  202        self.index_additionnal_fields = []
  203
  204    def get_indexing(self) -> bool:
  205        """
  206        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  207        returns False.
  208        :return: The value of the indexing parameter.
  209        """
  210
  211        return self.get_param().get("indexing", False)
  212
  213    def get_connexion_config(self) -> dict:
  214        """
  215        The function `get_connexion_config` returns a dictionary containing the configuration for a
  216        connection, including the number of threads and memory limit.
  217        :return: a dictionary containing the configuration for the Connexion library.
  218        """
  219
  220        # config
  221        config = self.get_config()
  222
  223        # Connexion config
  224        connexion_config = {}
  225        threads = self.get_threads()
  226
  227        # Threads
  228        if threads:
  229            connexion_config["threads"] = threads
  230
  231        # Memory
  232        # if config.get("memory", None):
  233        #     connexion_config["memory_limit"] = config.get("memory")
  234        if self.get_memory():
  235            connexion_config["memory_limit"] = self.get_memory()
  236
  237        # Temporary directory
  238        if config.get("tmp", None):
  239            connexion_config["temp_directory"] = config.get("tmp")
  240
  241        # Access
  242        if config.get("access", None):
  243            access = config.get("access")
  244            if access in ["RO"]:
  245                access = "READ_ONLY"
  246            elif access in ["RW"]:
  247                access = "READ_WRITE"
  248            connexion_db = self.get_connexion_db()
  249            if connexion_db in ":memory:":
  250                access = "READ_WRITE"
  251            connexion_config["access_mode"] = access
  252
  253        return connexion_config
  254
  255    def get_duckdb_settings(self) -> dict:
  256        """
  257        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  258        string.
  259        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  260        """
  261
  262        # config
  263        config = self.get_config()
  264
  265        # duckdb settings
  266        duckdb_settings_dict = {}
  267        if config.get("duckdb_settings", None):
  268            duckdb_settings = config.get("duckdb_settings")
  269            duckdb_settings = full_path(duckdb_settings)
  270            # duckdb setting is a file
  271            if os.path.exists(duckdb_settings):
  272                with open(duckdb_settings) as json_file:
  273                    duckdb_settings_dict = yaml.safe_load(json_file)
  274            # duckdb settings is a string
  275            else:
  276                duckdb_settings_dict = json.loads(duckdb_settings)
  277
  278        return duckdb_settings_dict
  279
  280    def set_connexion_db(self) -> str:
  281        """
  282        The function `set_connexion_db` returns the appropriate database connection string based on the
  283        input format and connection type.
  284        :return: the value of the variable `connexion_db`.
  285        """
  286
  287        # Default connexion db
  288        default_connexion_db = ":memory:"
  289
  290        # Find connexion db
  291        if self.get_input_format() in ["db", "duckdb"]:
  292            connexion_db = self.get_input()
  293        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  294            connexion_db = default_connexion_db
  295        elif self.get_connexion_type() in ["tmpfile"]:
  296            tmp_name = tempfile.mkdtemp(
  297                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  298            )
  299            connexion_db = f"{tmp_name}/tmp.db"
  300        elif self.get_connexion_type() != "":
  301            connexion_db = self.get_connexion_type()
  302        else:
  303            connexion_db = default_connexion_db
  304
  305        # Set connexion db
  306        self.connexion_db = connexion_db
  307
  308        return connexion_db
  309
  310    def set_connexion(self, conn) -> None:
  311        """
  312        The function `set_connexion` creates a connection to a database, with options for different
  313        database formats and settings.
  314
  315        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  316        database. If a connection is not provided, a new connection to an in-memory database is created.
  317        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  318        sqlite
  319        """
  320
  321        # Connexion db
  322        connexion_db = self.set_connexion_db()
  323
  324        # Connexion config
  325        connexion_config = self.get_connexion_config()
  326
  327        # Connexion format
  328        connexion_format = self.get_config().get("connexion_format", "duckdb")
  329        # Set connexion format
  330        self.connexion_format = connexion_format
  331
  332        # Connexion
  333        if not conn:
  334            if connexion_format in ["duckdb"]:
  335                conn = duckdb.connect(connexion_db, config=connexion_config)
  336                # duckDB settings
  337                duckdb_settings = self.get_duckdb_settings()
  338                if duckdb_settings:
  339                    for setting in duckdb_settings:
  340                        setting_value = duckdb_settings.get(setting)
  341                        if isinstance(setting_value, str):
  342                            setting_value = f"'{setting_value}'"
  343                        conn.execute(f"PRAGMA {setting}={setting_value};")
  344            elif connexion_format in ["sqlite"]:
  345                conn = sqlite3.connect(connexion_db)
  346
  347        # Set connexion
  348        self.conn = conn
  349
  350        # Log
  351        log.debug(f"connexion_format: {connexion_format}")
  352        log.debug(f"connexion_db: {connexion_db}")
  353        log.debug(f"connexion config: {connexion_config}")
  354        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  355
  356    def set_output(self, output: str = None) -> None:
  357        """
  358        The `set_output` function in Python sets the output file based on the input or a specified key
  359        in the config file, extracting the output name, extension, and format.
  360
  361        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  362        the output file. If the config file has an 'output' key, the method sets the output to the value
  363        of that key. If no output is provided, it sets the output to `None`
  364        :type output: str
  365        """
  366
  367        if output and not isinstance(output, str):
  368            self.output = output.name
  369        else:
  370            self.output = output
  371
  372        # Output format
  373        if self.output:
  374            output_name, output_extension = os.path.splitext(self.output)
  375            self.output_name = output_name
  376            self.output_extension = output_extension
  377            self.output_format = self.output_extension.replace(".", "")
  378        else:
  379            self.output_name = None
  380            self.output_extension = None
  381            self.output_format = None
  382
    def set_header(self) -> None:
        """
        Read (or reconstruct) the VCF header of the input file and store it
        both as a list of lines (`self.header_list`) and as a pyVCF Reader
        object (`self.header_vcf`). When there is no input file, both
        attributes are set to None.
        """

        input_file = self.get_input()
        # Minimal valid VCF header used whenever no header can be found
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param (explicit external header file)
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file (vcf.Writer emits the header
                            # lines of `db_header_from_columns` on open)
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:
                        # NOTE(review): bare except — any failure here
                        # (including interrupts) silently falls back to the
                        # default header; consider narrowing.

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # unknown/unsupported input format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
  484
  485    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  486        """
  487        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  488        DataFrame based on the connection format.
  489
  490        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  491        represents the SQL query you want to execute. This query will be used to fetch data from a
  492        database and convert it into a pandas DataFrame
  493        :type query: str
  494        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  495        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  496        function will only fetch up to that number of rows from the database query result. If no limit
  497        is specified,
  498        :type limit: int
  499        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  500        """
  501
  502        # Connexion format
  503        connexion_format = self.get_connexion_format()
  504
  505        # Limit in query
  506        if limit:
  507            pd.set_option("display.max_rows", limit)
  508            if connexion_format in ["duckdb"]:
  509                df = (
  510                    self.conn.execute(query)
  511                    .fetch_record_batch(limit)
  512                    .read_next_batch()
  513                    .to_pandas()
  514                )
  515            elif connexion_format in ["sqlite"]:
  516                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  517
  518        # Full query
  519        else:
  520            if connexion_format in ["duckdb"]:
  521                df = self.conn.execute(query).df()
  522            elif connexion_format in ["sqlite"]:
  523                df = pd.read_sql_query(query, self.conn)
  524
  525        return df
  526
  527    def get_overview(self) -> None:
  528        """
  529        The function prints the input, output, config, and dataframe of the current object
  530        """
  531        table_variants_from = self.get_table_variants(clause="from")
  532        sql_columns = self.get_header_columns_as_sql()
  533        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  534        df = self.get_query_to_df(sql_query_export)
  535        log.info(
  536            "Input:  "
  537            + str(self.get_input())
  538            + " ["
  539            + str(str(self.get_input_format()))
  540            + "]"
  541        )
  542        log.info(
  543            "Output: "
  544            + str(self.get_output())
  545            + " ["
  546            + str(str(self.get_output_format()))
  547            + "]"
  548        )
  549        log.info("Config: ")
  550        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  551            "\n"
  552        ):
  553            log.info("\t" + str(d))
  554        log.info("Param: ")
  555        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  556            "\n"
  557        ):
  558            log.info("\t" + str(d))
  559        log.info("Sample list: " + str(self.get_header_sample_list()))
  560        log.info("Dataframe: ")
  561        for d in str(df).split("\n"):
  562            log.info("\t" + str(d))
  563
  564        # garbage collector
  565        del df
  566        gc.collect()
  567
  568        return None
  569
    def get_stats(self) -> dict:
        """
        Compute statistics about the loaded variants and return them as a
        nested dictionary: input file info, variant counts per chromosome,
        per-sample genotype counts, INFO/FORMAT header fields, QUAL summary
        statistics, and SNV/InDel/MNV counts with substitution breakdown.

        :return: dictionary of statistics with "Infos", "Variants",
            "Samples" (when applicable), "Header" and "Quality" sections
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table name
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header: INFO and FORMAT field definitions from the VCF header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage of variants per chromosome
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when genotypes are present
        # NOTE(review): REGEXP_EXTRACT / regexp_matches / len / string_split
        # are DuckDB SQL functions — this section presumably requires a duckdb
        # connexion (it also calls `.df()` on the cursor); confirm for sqlite.
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; rows are kept only when the
                # sample column really looks like a genotype and has as many
                # ':'-separated fields as the FORMAT column
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: map pyVCF special codes to VCF header letters
                # (None -> ".", -1 -> "A", -2 -> "G", -3 -> "R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # NOTE(review): the InDel WHERE clause mixes OR and AND without
        # parentheses; since AND binds tighter it reads as
        # len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT)) — confirm
        # this is the intended classification.

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Substitution breakdown for SNVs (e.g. "A>G")
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  791
  792    def stats_to_file(self, file: str = None) -> str:
  793        """
  794        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  795        into a JSON object, and writes the JSON object to the specified file.
  796
  797        :param file: The `file` parameter is a string that represents the file path where the JSON data
  798        will be written
  799        :type file: str
  800        :return: the name of the file that was written to.
  801        """
  802
  803        # Get stats
  804        stats = self.get_stats()
  805
  806        # Serializing json
  807        json_object = json.dumps(stats, indent=4)
  808
  809        # Writing to sample.json
  810        with open(file, "w") as outfile:
  811            outfile.write(json_object)
  812
  813        return file
  814
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report of the statistics and print it to stdout.

        The stats are first dumped to a JSON file (via `stats_to_file`), read
        back, rendered as markdown (one section per top-level key, one table
        per sub-entry when it can be turned into a DataFrame) and written to
        `output_file`; the title and sections are also printed to stdout.

        :param output_file: Path of the markdown output file; when not
        provided, a "stats.md" file inside a temporary directory is used (and
        discarded when the temporary directory is cleaned up)
        :type output_file: str
        :param json_file: Path of the intermediate JSON stats file; when not
        provided, a "stats.json" file inside a temporary directory is used
        :type json_file: str
        :return: None.
        """

        # Normalize user-supplied paths (expansion of ~, env vars, ...)
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default both files into the temporary directory when not provided
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Ensure parent folders of both output files exist
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Dump stats to the JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Read the stats back; yaml.safe_load parses JSON since JSON is a
            # subset of YAML
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Accumulators for the three report parts: title, index, body
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # One markdown section per top-level stats key
            for section in stats:
                infos = stats.get(section)
                # GitHub-style anchor for the index link
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the entry as a table: first as a
                        # dict-of-rows, then as a JSON string; otherwise fall
                        # back to a plain bullet line. The bare excepts are
                        # intentional best-effort rendering.
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"   - [{info}]({info_link})")
                            # to_markdown requires the optional 'tabulate' package
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write title, index and body to the markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Print to stdout; note the index is written to the file only,
            # not printed here
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
  916
  917    def get_input(self) -> str:
  918        """
  919        It returns the value of the input variable.
  920        :return: The input is being returned.
  921        """
  922        return self.input
  923
  924    def get_input_format(self, input_file: str = None) -> str:
  925        """
  926        This function returns the format of the input variable, either from the provided input file or
  927        by prompting for input.
  928
  929        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  930        represents the file path of the input file. If no `input_file` is provided when calling the
  931        method, it will default to `None`
  932        :type input_file: str
  933        :return: The format of the input variable is being returned.
  934        """
  935
  936        if not input_file:
  937            input_file = self.get_input()
  938        input_format = get_file_format(input_file)
  939        return input_format
  940
  941    def get_input_compressed(self, input_file: str = None) -> str:
  942        """
  943        The function `get_input_compressed` returns the format of the input variable after compressing
  944        it.
  945
  946        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  947        that represents the file path of the input file. If no `input_file` is provided when calling the
  948        method, it will default to `None` and the method will then call `self.get_input()` to
  949        :type input_file: str
  950        :return: The function `get_input_compressed` returns the compressed format of the input
  951        variable.
  952        """
  953
  954        if not input_file:
  955            input_file = self.get_input()
  956        input_compressed = get_file_compressed(input_file)
  957        return input_compressed
  958
  959    def get_output(self) -> str:
  960        """
  961        It returns the output of the neuron.
  962        :return: The output of the neural network.
  963        """
  964
  965        return self.output
  966
  967    def get_output_format(self, output_file: str = None) -> str:
  968        """
  969        The function `get_output_format` returns the format of the input variable or the output file if
  970        provided.
  971
  972        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  973        that represents the file path of the output file. If no `output_file` is provided when calling
  974        the method, it will default to the output obtained from the `get_output` method of the class
  975        instance. The
  976        :type output_file: str
  977        :return: The format of the input variable is being returned.
  978        """
  979
  980        if not output_file:
  981            output_file = self.get_output()
  982        output_format = get_file_format(output_file)
  983
  984        return output_format
  985
  986    def get_config(self) -> dict:
  987        """
  988        It returns the config
  989        :return: The config variable is being returned.
  990        """
  991        return self.config
  992
  993    def get_param(self) -> dict:
  994        """
  995        It returns the param
  996        :return: The param variable is being returned.
  997        """
  998        return self.param
  999
 1000    def get_connexion_db(self) -> str:
 1001        """
 1002        It returns the connexion_db attribute of the object
 1003        :return: The connexion_db is being returned.
 1004        """
 1005        return self.connexion_db
 1006
 1007    def get_prefix(self) -> str:
 1008        """
 1009        It returns the prefix of the object.
 1010        :return: The prefix is being returned.
 1011        """
 1012        return self.prefix
 1013
 1014    def get_table_variants(self, clause: str = "select") -> str:
 1015        """
 1016        This function returns the table_variants attribute of the object
 1017
 1018        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1019        defaults to select (optional)
 1020        :return: The table_variants attribute of the object.
 1021        """
 1022
 1023        # Access
 1024        access = self.get_config().get("access", None)
 1025
 1026        # Clauses "select", "where", "update"
 1027        if clause in ["select", "where", "update"]:
 1028            table_variants = self.table_variants
 1029        # Clause "from"
 1030        elif clause in ["from"]:
 1031            # For Read Only
 1032            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1033                input_file = self.get_input()
 1034                table_variants = f"'{input_file}' as variants"
 1035            # For Read Write
 1036            else:
 1037                table_variants = f"{self.table_variants} as variants"
 1038        else:
 1039            table_variants = self.table_variants
 1040        return table_variants
 1041
 1042    def get_tmp_dir(self) -> str:
 1043        """
 1044        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1045        parameters or a default path.
 1046        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1047        configuration, parameters, and a default value of "/tmp".
 1048        """
 1049
 1050        return get_tmp(
 1051            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1052        )
 1053
 1054    def get_connexion_type(self) -> str:
 1055        """
 1056        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1057
 1058        :return: The connexion type is being returned.
 1059        """
 1060        return self.get_config().get("connexion_type", "memory")
 1061
 1062    def get_connexion(self):
 1063        """
 1064        It returns the connection object
 1065
 1066        :return: The connection object.
 1067        """
 1068        return self.conn
 1069
 1070    def close_connexion(self) -> None:
 1071        """
 1072        This function closes the connection to the database.
 1073        :return: The connection is being closed.
 1074        """
 1075        return self.conn.close()
 1076
 1077    def get_header(self, type: str = "vcf"):
 1078        """
 1079        This function returns the header of the VCF file as a list of strings
 1080
 1081        :param type: the type of header you want to get, defaults to vcf (optional)
 1082        :return: The header of the vcf file.
 1083        """
 1084
 1085        if self.header_vcf:
 1086            if type == "vcf":
 1087                return self.header_vcf
 1088            elif type == "list":
 1089                return self.header_list
 1090        else:
 1091            if type == "vcf":
 1092                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1093                return header
 1094            elif type == "list":
 1095                return vcf_required
 1096
 1097    def get_header_infos_list(self) -> list:
 1098        """
 1099        This function retrieves a list of information fields from the header.
 1100        :return: A list of information fields from the header.
 1101        """
 1102
 1103        # Init
 1104        infos_list = []
 1105
 1106        for field in self.get_header().infos:
 1107            infos_list.append(field)
 1108
 1109        return infos_list
 1110
 1111    def get_header_length(self, file: str = None) -> int:
 1112        """
 1113        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1114        line.
 1115
 1116        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1117        header file. If this argument is provided, the function will read the header from the specified
 1118        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1119        :type file: str
 1120        :return: the length of the header list, excluding the #CHROM line.
 1121        """
 1122
 1123        if file:
 1124            return len(self.read_vcf_header_file(file=file)) - 1
 1125        elif self.get_header(type="list"):
 1126            return len(self.get_header(type="list")) - 1
 1127        else:
 1128            return 0
 1129
 1130    def get_header_columns(self) -> str:
 1131        """
 1132        This function returns the header list of a VCF
 1133
 1134        :return: The length of the header list.
 1135        """
 1136        if self.get_header():
 1137            return self.get_header(type="list")[-1]
 1138        else:
 1139            return ""
 1140
 1141    def get_header_columns_as_list(self) -> list:
 1142        """
 1143        This function returns the header list of a VCF
 1144
 1145        :return: The length of the header list.
 1146        """
 1147        if self.get_header():
 1148            return self.get_header_columns().strip().split("\t")
 1149        else:
 1150            return []
 1151
 1152    def get_header_columns_as_sql(self) -> str:
 1153        """
 1154        This function retruns header length (without #CHROM line)
 1155
 1156        :return: The length of the header list.
 1157        """
 1158        sql_column_list = []
 1159        for col in self.get_header_columns_as_list():
 1160            sql_column_list.append(f'"{col}"')
 1161        return ",".join(sql_column_list)
 1162
 1163    def get_header_sample_list(
 1164        self, check: bool = False, samples: list = None, samples_force: bool = False
 1165    ) -> list:
 1166        """
 1167        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1168        checking and filtering based on input parameters.
 1169
 1170        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1171        parameter that determines whether to check if the samples in the list are properly defined as
 1172        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1173        list is defined as a, defaults to False
 1174        :type check: bool (optional)
 1175        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1176        allows you to specify a subset of samples from the header. If you provide a list of sample
 1177        names, the function will check if each sample is defined in the header. If a sample is not found
 1178        in the
 1179        :type samples: list
 1180        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1181        a boolean parameter that determines whether to force the function to return the sample list
 1182        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1183        function will return the sample list without performing, defaults to False
 1184        :type samples_force: bool (optional)
 1185        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1186        parameters and conditions specified in the function.
 1187        """
 1188
 1189        # Init
 1190        samples_list = []
 1191
 1192        if samples is None:
 1193            samples_list = self.header_vcf.samples
 1194        else:
 1195            samples_checked = []
 1196            for sample in samples:
 1197                if sample in self.header_vcf.samples:
 1198                    samples_checked.append(sample)
 1199                else:
 1200                    log.warning(f"Sample '{sample}' not defined in header")
 1201            samples_list = samples_checked
 1202
 1203            # Force sample list without checking if is_genotype_column
 1204            if samples_force:
 1205                log.warning(f"Samples {samples_list} not checked if genotypes")
 1206                return samples_list
 1207
 1208        if check:
 1209            samples_checked = []
 1210            for sample in samples_list:
 1211                if self.is_genotype_column(column=sample):
 1212                    samples_checked.append(sample)
 1213                else:
 1214                    log.warning(
 1215                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1216                    )
 1217            samples_list = samples_checked
 1218
 1219        # Return samples list
 1220        return samples_list
 1221
 1222    def is_genotype_column(self, column: str = None) -> bool:
 1223        """
 1224        This function checks if a given column is a genotype column in a database.
 1225
 1226        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1227        represents the column name in a database table. This method checks if the specified column is a
 1228        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1229        method of
 1230        :type column: str
 1231        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1232        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1233        column name and returns the result. If the `column` parameter is None, it returns False.
 1234        """
 1235
 1236        if column is not None:
 1237            return Database(database=self.get_input()).is_genotype_column(column=column)
 1238        else:
 1239            return False
 1240
 1241    def get_verbose(self) -> bool:
 1242        """
 1243        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1244        exist
 1245
 1246        :return: The value of the key "verbose" in the config dictionary.
 1247        """
 1248        return self.get_config().get("verbose", False)
 1249
 1250    def get_connexion_format(self) -> str:
 1251        """
 1252        It returns the connexion format of the object.
 1253        :return: The connexion_format is being returned.
 1254        """
 1255        connexion_format = self.connexion_format
 1256        if connexion_format not in ["duckdb", "sqlite"]:
 1257            log.error(f"Unknown connexion format {connexion_format}")
 1258            raise ValueError(f"Unknown connexion format {connexion_format}")
 1259        else:
 1260            return connexion_format
 1261
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        'variants' table of the current database connexion.

        :param file: Path or open file handle of the delimited file to load;
        it is passed straight to pandas.read_csv
        :param columns: Comma-separated, SQL-quoted column names used in the
        duckdb INSERT statement (e.g. '"#CHROM", "POS", ...')
        :type columns: str
        :param header_len: Number of leading lines to skip (e.g. a VCF
        header) before reading data, defaults to 0
        :type header_len: int (optional)
        :param sep: Field delimiter of the input file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; may be overridden by
        the "load.chunk" configuration entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Configuration may override the requested chunk size
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): a falsy chunksize (0/None from config) silently skips
        # the whole load — confirm this is intended
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves the local DataFrame variable 'chunk' by
                    # name via its replacement scan — do not rename it
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # For sqlite, pandas performs the INSERT itself
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1315
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Load the input file (VCF/TSV/CSV/PSV/parquet/duckdb...) into the
        variants table of the current database connexion.

        For a duckdb connexion the data is loaded through a CREATE TABLE/VIEW
        over the source; for a sqlite connexion a variants table is created
        explicitly and filled chunk by chunk. Afterwards, INFO fields may be
        exploded into columns and indexes are created.

        :param input_file: Optional path of the input file; when given it
        replaces the current input and the header is reloaded
        :type input_file: str
        :param drop_variants_table: When True, drop the variants table before
        loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows sampled to infer the source
        schema; falsy values are mapped to -1 (presumably "use all rows" —
        see Database.get_sql_from), defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: When the input format is not loadable with the
        current connexion format.
        """

        log.info("Loading...")

        # Switch to a new input file and reload its header
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # Optionally drop the existing variants table first
        if drop_variants_table:
            self.drop_variants_table()

        # Target table name
        table_variants = self.get_table_variants()

        # Access mode from configuration (e.g. "RO" for read-only)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compression status
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # Compression format label (logging only; not used below)
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Falsy sample size means "no sampling limit"
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Input is already a duckdb database
            if self.input_format in ["db", "duckdb"]:

                # NOTE(review): the else branch below is unreachable —
                # connexion_format is already known to be "duckdb" here
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from another file format via a CREATE TABLE/VIEW
            else:

                # NOTE(review): the bare except below masks the original
                # error (I/O, SQL, schema...) behind a generic message
                try:
                    # Create Table (read-write) or View (read-only)
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion: only delimited text formats are supported
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Fixed VCF column structure (name -> SQL type)
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure extended with FORMAT and one column per sample.
            # NOTE(review): this is an alias, not a copy — the additions
            # below also mutate `structure`; harmless here but confirm
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Build the column clauses for CREATE and the column list for INSERT
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create the variants table if it does not exist yet
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # Number of file rows loaded per chunk
            chunksize = 100000

            # Field delimiter inferred from the input format (default: tab)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # NOTE(review): for compressed input the plain handle opened
                # by the `with` above is replaced (and the bgzf handle is
                # never explicitly closed) — confirm intended
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # Only VCF input carries a header to skip
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into the variants table, chunk by chunk
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        # Unsupported connexion/format combination
        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Optionally explode INFO fields into dedicated table columns
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create indexes after insertion
        self.create_indexes()
 1511
 1512    def get_explode_infos(self) -> bool:
 1513        """
 1514        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1515        to False if it is not set.
 1516        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1517        value. If the parameter is not present, it will return False.
 1518        """
 1519
 1520        return self.get_param().get("explode", {}).get("explode_infos", False)
 1521
 1522    def get_explode_infos_fields(
 1523        self,
 1524        explode_infos_fields: str = None,
 1525        remove_fields_not_in_header: bool = False,
 1526    ) -> list:
 1527        """
 1528        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1529        the input parameter `explode_infos_fields`.
 1530
 1531        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1532        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1533        comma-separated list of field names to explode
 1534        :type explode_infos_fields: str
 1535        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1536        flag that determines whether to remove fields that are not present in the header. If it is set
 1537        to `True`, any field that is not in the header will be excluded from the list of exploded
 1538        information fields. If it is set to `, defaults to False
 1539        :type remove_fields_not_in_header: bool (optional)
 1540        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1541        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1542        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1543        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1544        splitting the string by commas.
 1545        """
 1546
 1547        # If no fields, get it in param
 1548        if not explode_infos_fields:
 1549            explode_infos_fields = (
 1550                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1551            )
 1552
 1553        # If no fields, defined as all fields in header using keyword
 1554        if not explode_infos_fields:
 1555            explode_infos_fields = "*"
 1556
 1557        # If fields list not empty
 1558        if explode_infos_fields:
 1559
 1560            # Input fields list
 1561            if isinstance(explode_infos_fields, str):
 1562                fields_input = explode_infos_fields.split(",")
 1563            elif isinstance(explode_infos_fields, list):
 1564                fields_input = explode_infos_fields
 1565            else:
 1566                fields_input = []
 1567
 1568            # Fields list without * keyword
 1569            fields_without_all = fields_input.copy()
 1570            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1571                fields_without_all.remove("*")
 1572
 1573            # Fields in header
 1574            fields_in_header = sorted(list(set(self.get_header().infos)))
 1575
 1576            # Construct list of fields
 1577            fields_output = []
 1578            for field in fields_input:
 1579
 1580                # Strip field
 1581                field = field.strip()
 1582
 1583                # format keyword * in regex
 1584                if field.upper() in ["*"]:
 1585                    field = ".*"
 1586
 1587                # Find all fields with pattern
 1588                r = re.compile(field)
 1589                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1590
 1591                # Remove fields input from search
 1592                if field in fields_search:
 1593                    fields_search = [field]
 1594                elif fields_search != [field]:
 1595                    fields_search = sorted(
 1596                        list(set(fields_search).difference(fields_input))
 1597                    )
 1598
 1599                # If field is not in header (avoid not well formatted header)
 1600                if not fields_search and not remove_fields_not_in_header:
 1601                    fields_search = [field]
 1602
 1603                # Add found fields
 1604                for new_field in fields_search:
 1605                    # Add field, if not already exists, and if it is in header (if asked)
 1606                    if (
 1607                        new_field not in fields_output
 1608                        and (
 1609                            not remove_fields_not_in_header
 1610                            or new_field in fields_in_header
 1611                        )
 1612                        and new_field not in [".*"]
 1613                    ):
 1614                        fields_output.append(new_field)
 1615
 1616            return fields_output
 1617
 1618        else:
 1619
 1620            return []
 1621
 1622    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1623        """
 1624        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1625        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1626        not provided.
 1627
 1628        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1629        prefix to be used for exploding or expanding information
 1630        :type explode_infos_prefix: str
 1631        :return: the value of the variable `explode_infos_prefix`.
 1632        """
 1633
 1634        if not explode_infos_prefix:
 1635            explode_infos_prefix = (
 1636                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1637            )
 1638
 1639        return explode_infos_prefix
 1640
 1641    def add_column(
 1642        self,
 1643        table_name,
 1644        column_name,
 1645        column_type,
 1646        default_value=None,
 1647        drop: bool = False,
 1648    ) -> dict:
 1649        """
 1650        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1651        doesn't already exist.
 1652
 1653        :param table_name: The name of the table to which you want to add a column
 1654        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1655        to the table
 1656        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1657        want to add to the table. It should be a string that represents the desired data type, such as
 1658        "INTEGER", "TEXT", "REAL", etc
 1659        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1660        default value for the newly added column. If a default value is provided, it will be assigned to
 1661        the column for any existing rows that do not have a value for that column
 1662        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1663        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1664        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1665        to False
 1666        :type drop: bool (optional)
 1667        :return: a boolean value indicating whether the column was successfully added to the table.
 1668        """
 1669
 1670        # added
 1671        added = False
 1672        dropped = False
 1673
 1674        # Check if the column already exists in the table
 1675        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1676        columns = self.get_query_to_df(query).columns.tolist()
 1677        if column_name.upper() in [c.upper() for c in columns]:
 1678            log.debug(
 1679                f"The {column_name} column already exists in the {table_name} table"
 1680            )
 1681            if drop:
 1682                self.drop_column(table_name=table_name, column_name=column_name)
 1683                dropped = True
 1684            else:
 1685                return None
 1686        else:
 1687            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1688
 1689        # Add column in table
 1690        add_column_query = (
 1691            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1692        )
 1693        if default_value is not None:
 1694            add_column_query += f" DEFAULT {default_value}"
 1695        self.execute_query(add_column_query)
 1696        added = not dropped
 1697        log.debug(
 1698            f"The {column_name} column was successfully added to the {table_name} table"
 1699        )
 1700
 1701        if added:
 1702            added_column = {
 1703                "table_name": table_name,
 1704                "column_name": column_name,
 1705                "column_type": column_type,
 1706                "default_value": default_value,
 1707            }
 1708        else:
 1709            added_column = None
 1710
 1711        return added_column
 1712
 1713    def drop_column(
 1714        self, column: dict = None, table_name: str = None, column_name: str = None
 1715    ) -> bool:
 1716        """
 1717        The `drop_column` function drops a specified column from a given table in a database and returns
 1718        True if the column was successfully dropped, and False if the column does not exist in the
 1719        table.
 1720
 1721        :param column: The `column` parameter is a dictionary that contains information about the column
 1722        you want to drop. It has two keys:
 1723        :type column: dict
 1724        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1725        drop a column
 1726        :type table_name: str
 1727        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1728        from the table
 1729        :type column_name: str
 1730        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1731        and False if the column does not exist in the table.
 1732        """
 1733
 1734        # Find column infos
 1735        if column:
 1736            if isinstance(column, dict):
 1737                table_name = column.get("table_name", None)
 1738                column_name = column.get("column_name", None)
 1739            elif isinstance(column, str):
 1740                table_name = self.get_table_variants()
 1741                column_name = column
 1742            else:
 1743                table_name = None
 1744                column_name = None
 1745
 1746        if not table_name and not column_name:
 1747            return False
 1748
 1749        # Removed
 1750        removed = False
 1751
 1752        # Check if the column already exists in the table
 1753        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1754        columns = self.get_query_to_df(query).columns.tolist()
 1755        if column_name in columns:
 1756            log.debug(f"The {column_name} column exists in the {table_name} table")
 1757        else:
 1758            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1759            return False
 1760
 1761        # Add column in table # ALTER TABLE integers DROP k
 1762        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1763        self.execute_query(add_column_query)
 1764        removed = True
 1765        log.debug(
 1766            f"The {column_name} column was successfully dropped to the {table_name} table"
 1767        )
 1768
 1769        return removed
 1770
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode INFO fields of the variants table into individual columns.

        For each requested field, a new column named `<prefix><field>` is added
        (via `add_column`) and filled by an SQL UPDATE that extracts the
        `field=value` pair from the INFO column. The extraction SQL depends on
        the connexion format (DuckDB regex vs SQLite instr/substr). Updates are
        run per chromosome to keep individual statements smaller.

        :param prefix: prefix for exploded columns; when None/True or not a
            string, falls back to `get_explode_infos_prefix()` then "INFO/"
        :type prefix: str
        :param create_index: when True, recreate indexes after the explode
            (indexes are always dropped first), defaults to False
        :type create_index: bool (optional)
        :param fields: INFO fields (names or patterns) to explode; resolved
            through `get_explode_infos_fields`, so empty means all fields
        :type fields: list
        :param force: when True, existing columns are dropped and re-created
            (passed as `drop` to `add_column`) and their UPDATE is re-run,
            defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: when True, all field updates are
            combined into a single UPDATE statement per chromosome; otherwise
            one UPDATE per field, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: target table name; defaults to the variants table
        :type table: str
        :return: the list of column-description dicts actually added
            (empty when the database access is read-only "RO")
        """

        # drop indexes: the UPDATEs below would otherwise have to maintain them
        self.drop_indexes()

        # connexion format ("duckdb" or "sqlite"): selects the SQL dialect
        connexion_format = self.get_connexion_format()

        # Access mode: nothing is modified on a read-only database
        access = self.get_config().get("access", None)

        # Columns added to the table by this call (returned to the caller)
        added_columns = []

        if access not in ["RO"]:

            # prefix: normalize to a string, defaulting to param then "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # target table (defaults to the variants table)
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except silently falls back to [] on ANY error
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # INFO fields declared in the VCF header
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # Per-field SET clauses collected for the UPDATE statements below
            sql_info_alter_table_array = []

            # Known fields: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields, normalize to an empty list for pattern resolution
            if not fields:
                fields = []

            # Translate field patterns (e.g. "*") into concrete field names
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Column name for the exploded field
                info_id_sql = prefix + info

                # Only explode fields known from header, request, or extras
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/arity from the header; unknown fields fall back to
                    # String with num 0 (thus VARCHAR below)
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Map VCF type to SQL type; multi-valued fields stay VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add the column (dropped first when force=True)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # Fill the column when it is new, or when forced
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the SET clause extracting "info=value" from INFO.
                        # NOTE(review): if connexion_format is neither "duckdb"
                        # nor "sqlite", update_info_field keeps its value from
                        # the previous iteration (or is unbound on the first) —
                        # assumed unreachable here; verify upstream validation.
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Run UPDATEs chromosome by chromosome to limit statement size
                # NOTE(review): bare except falls back to a single full-table
                # pass ([None]) on ANY query error
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Restrict to one chromosome only when there are several
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: one combined statement, or one per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes (they were dropped at the top of this method)
        if create_index:
            self.create_indexes()

        return added_columns
 1987
 1988    def create_indexes(self) -> None:
 1989        """
 1990        Create indexes on the table after insertion
 1991        """
 1992
 1993        # Access
 1994        access = self.get_config().get("access", None)
 1995
 1996        # get table variants
 1997        table_variants = self.get_table_variants("FROM")
 1998
 1999        if self.get_indexing() and access not in ["RO"]:
 2000            # Create index
 2001            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2002            self.conn.execute(sql_create_table_index)
 2003            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2004            self.conn.execute(sql_create_table_index)
 2005            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2006            self.conn.execute(sql_create_table_index)
 2007            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2008            self.conn.execute(sql_create_table_index)
 2009            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2010            self.conn.execute(sql_create_table_index)
 2011            for field in self.index_additionnal_fields:
 2012                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2013                self.conn.execute(sql_create_table_index)
 2014
 2015    def drop_indexes(self) -> None:
 2016        """
 2017        Create indexes on the table after insertion
 2018        """
 2019
 2020        # Access
 2021        access = self.get_config().get("access", None)
 2022
 2023        # get table variants
 2024        table_variants = self.get_table_variants("FROM")
 2025
 2026        # Get database format
 2027        connexion_format = self.get_connexion_format()
 2028
 2029        if access not in ["RO"]:
 2030            if connexion_format in ["duckdb"]:
 2031                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2032            elif connexion_format in ["sqlite"]:
 2033                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2034
 2035            list_indexes = self.conn.execute(sql_list_indexes)
 2036            index_names = [row[0] for row in list_indexes.fetchall()]
 2037            for index in index_names:
 2038                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2039                self.conn.execute(sql_drop_table_index)
 2040
 2041    def read_vcf_header(self, f) -> list:
 2042        """
 2043        It reads the header of a VCF file and returns a list of the header lines
 2044
 2045        :param f: the file object
 2046        :return: The header lines of the VCF file.
 2047        """
 2048
 2049        header_list = []
 2050        for line in f:
 2051            header_list.append(line)
 2052            if line.startswith("#CHROM"):
 2053                break
 2054        return header_list
 2055
 2056    def read_vcf_header_file(self, file: str = None) -> list:
 2057        """
 2058        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2059        uncompressed files.
 2060
 2061        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2062        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2063        default to `None`
 2064        :type file: str
 2065        :return: The function `read_vcf_header_file` returns a list.
 2066        """
 2067
 2068        if self.get_input_compressed(input_file=file):
 2069            with bgzf.open(file, "rt") as f:
 2070                return self.read_vcf_header(f=f)
 2071        else:
 2072            with open(file, "rt") as f:
 2073                return self.read_vcf_header(f=f)
 2074
 2075    def execute_query(self, query: str):
 2076        """
 2077        It takes a query as an argument, executes it, and returns the results
 2078
 2079        :param query: The query to be executed
 2080        :return: The result of the query is being returned.
 2081        """
 2082        if query:
 2083            return self.conn.execute(query)  # .fetchall()
 2084        else:
 2085            return None
 2086
 2087    def export_output(
 2088        self,
 2089        output_file: str | None = None,
 2090        output_header: str | None = None,
 2091        export_header: bool = True,
 2092        query: str | None = None,
 2093        parquet_partitions: list | None = None,
 2094        chunk_size: int | None = None,
 2095        threads: int | None = None,
 2096        sort: bool = False,
 2097        index: bool = False,
 2098        order_by: str | None = None,
 2099    ) -> bool:
 2100        """
 2101        The `export_output` function exports data from a VCF file to a specified output file in various
 2102        formats, including VCF, CSV, TSV, PSV, and Parquet.
 2103
 2104        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2105        output file to be generated by the function. This is where the exported data will be saved
 2106        :type output_file: str
 2107        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2108        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2109        header will be exported to a file with the same name as the `output_file` parameter, but with
 2110        the extension "
 2111        :type output_header: str
 2112        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2113        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2114        True, the header will be exported to a file. If `export_header` is False, the header will not
 2115        be, defaults to True, if output format is not VCF
 2116        :type export_header: bool (optional)
 2117        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 2118        select specific data from the VCF file before exporting it. If provided, only the data that
 2119        matches the query will be exported
 2120        :type query: str
 2121        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2122        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2123        organize data in a hierarchical directory structure based on the values of one or more columns.
 2124        This can improve query performance when working with large datasets
 2125        :type parquet_partitions: list
 2126        :param chunk_size: The `chunk_size` parameter specifies the number of
 2127        records in batch when exporting data in Parquet format. This parameter is used for
 2128        partitioning the Parquet file into multiple files.
 2129        :type chunk_size: int
 2130        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2131        threads to be used during the export process. It determines the level of parallelism and can
 2132        improve the performance of the export operation. If not provided, the function will use the
 2133        default number of threads
 2134        :type threads: int
 2135        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2136        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2137        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2138        False
 2139        :type sort: bool (optional)
 2140        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2141        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2142        no index will be created. The default value is False, defaults to False
 2143        :type index: bool (optional)
 2144        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2145        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2146        :type order_by: str
 2147        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2148        None if it doesn't.
 2149        """
 2150
 2151        # Log
 2152        log.info("Exporting...")
 2153
 2154        # Full path
 2155        output_file = full_path(output_file)
 2156        output_header = full_path(output_header)
 2157
 2158        # Config
 2159        config = self.get_config()
 2160
 2161        # Param
 2162        param = self.get_param()
 2163
 2164        # Tmp files to remove
 2165        tmp_to_remove = []
 2166
 2167        # If no output, get it
 2168        if not output_file:
 2169            output_file = self.get_output()
 2170
 2171        # If not threads
 2172        if not threads:
 2173            threads = self.get_threads()
 2174
 2175        # Auto header name with extension
 2176        if export_header or output_header:
 2177            if not output_header:
 2178                output_header = f"{output_file}.hdr"
 2179            # Export header
 2180            self.export_header(output_file=output_file)
 2181
 2182        # Switch off export header if VCF output
 2183        output_file_type = get_file_format(output_file)
 2184        if output_file_type in ["vcf"]:
 2185            export_header = False
 2186            tmp_to_remove.append(output_header)
 2187
 2188        # Chunk size
 2189        if not chunk_size:
 2190            chunk_size = config.get("chunk_size", None)
 2191
 2192        # Parquet partition
 2193        if not parquet_partitions:
 2194            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2195        if parquet_partitions and isinstance(parquet_partitions, str):
 2196            parquet_partitions = parquet_partitions.split(",")
 2197
 2198        # Order by
 2199        if not order_by:
 2200            order_by = param.get("export", {}).get("order_by", "")
 2201
 2202        # Header in output
 2203        header_in_output = param.get("export", {}).get("include_header", False)
 2204
 2205        # Database
 2206        database_source = self.get_connexion()
 2207
 2208        # Connexion format
 2209        connexion_format = self.get_connexion_format()
 2210
 2211        # Explode infos
 2212        if self.get_explode_infos():
 2213            self.explode_infos(
 2214                prefix=self.get_explode_infos_prefix(),
 2215                fields=self.get_explode_infos_fields(),
 2216                force=False,
 2217            )
 2218
 2219        # if connexion_format in ["sqlite"] or query:
 2220        if connexion_format in ["sqlite"]:
 2221
 2222            # Export in Parquet
 2223            random_tmp = "".join(
 2224                random.choice(string.ascii_lowercase) for i in range(10)
 2225            )
 2226            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2227            tmp_to_remove.append(database_source)
 2228
 2229            # Table Variants
 2230            table_variants = self.get_table_variants()
 2231
 2232            # Create export query
 2233            sql_query_export_subquery = f"""
 2234                SELECT * FROM {table_variants}
 2235                """
 2236
 2237            # Write source file
 2238            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2239
 2240        # Create database
 2241        database = Database(
 2242            database=database_source,
 2243            table="variants",
 2244            header_file=output_header,
 2245            conn_config=self.get_connexion_config(),
 2246        )
 2247
 2248        # Existing colomns header
 2249        existing_columns_header = database.get_header_columns_from_database(query=query)
 2250
 2251        # Sample list
 2252        if output_file_type in ["vcf"]:
 2253            get_samples = self.get_samples()
 2254            get_samples_check = self.get_samples_check()
 2255            samples_force = get_samples is not None
 2256            sample_list = self.get_header_sample_list(
 2257                check=get_samples_check,
 2258                samples=get_samples,
 2259                samples_force=samples_force,
 2260            )
 2261        else:
 2262            sample_list = None
 2263
 2264        # Export file
 2265        database.export(
 2266            output_database=output_file,
 2267            output_header=output_header,
 2268            existing_columns_header=existing_columns_header,
 2269            parquet_partitions=parquet_partitions,
 2270            chunk_size=chunk_size,
 2271            threads=threads,
 2272            sort=sort,
 2273            index=index,
 2274            header_in_output=header_in_output,
 2275            order_by=order_by,
 2276            query=query,
 2277            export_header=export_header,
 2278            sample_list=sample_list,
 2279        )
 2280
 2281        # Remove
 2282        remove_if_exists(tmp_to_remove)
 2283
 2284        return (os.path.exists(output_file) or None) and (
 2285            os.path.exists(output_file) or None
 2286        )
 2287
 2288    def get_extra_infos(self, table: str = None) -> list:
 2289        """
 2290        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2291        in the header.
 2292
 2293        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2294        name of the table from which you want to retrieve the extra columns that are not present in the
 2295        header. If the `table` parameter is not provided when calling the function, it will default to
 2296        using the variants
 2297        :type table: str
 2298        :return: A list of columns that are in the specified table but not in the header of the table.
 2299        """
 2300
 2301        header_columns = []
 2302
 2303        if not table:
 2304            table = self.get_table_variants(clause="from")
 2305            header_columns = self.get_header_columns()
 2306
 2307        # Check all columns in the database
 2308        query = f""" SELECT * FROM {table} LIMIT 1 """
 2309        log.debug(f"query {query}")
 2310        table_columns = self.get_query_to_df(query).columns.tolist()
 2311        extra_columns = []
 2312
 2313        # Construct extra infos (not in header)
 2314        for column in table_columns:
 2315            if column not in header_columns:
 2316                extra_columns.append(column)
 2317
 2318        return extra_columns
 2319
 2320    def get_extra_infos_sql(self, table: str = None) -> str:
 2321        """
 2322        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2323        by double quotes
 2324
 2325        :param table: The name of the table to get the extra infos from. If None, the default table is
 2326        used
 2327        :type table: str
 2328        :return: A string of the extra infos
 2329        """
 2330
 2331        return ", ".join(
 2332            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2333        )
 2334
 2335    def export_header(
 2336        self,
 2337        header_name: str = None,
 2338        output_file: str = None,
 2339        output_file_ext: str = ".hdr",
 2340        clean_header: bool = True,
 2341        remove_chrom_line: bool = False,
 2342    ) -> str:
 2343        """
 2344        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2345        specified options, and writes it to a new file.
 2346
 2347        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2348        this parameter is not specified, the header will be written to the output file
 2349        :type header_name: str
 2350        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2351        specify the name of the output file where the header will be written. If this parameter is not
 2352        provided, the header will be written to a temporary file
 2353        :type output_file: str
 2354        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2355        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2356        if not specified by the user. This extension will be appended to the `output_file` name to
 2357        create the final, defaults to .hdr
 2358        :type output_file_ext: str (optional)
 2359        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2360        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2361        `True`, the function will clean the header by modifying certain lines based on a specific
 2362        pattern. If `clean_header`, defaults to True
 2363        :type clean_header: bool (optional)
 2364        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2365        boolean flag that determines whether the #CHROM line should be removed from the header before
 2366        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2367        defaults to False
 2368        :type remove_chrom_line: bool (optional)
 2369        :return: The function `export_header` returns the name of the temporary header file that is
 2370        created.
 2371        """
 2372
 2373        if not header_name and not output_file:
 2374            output_file = self.get_output()
 2375
 2376        if self.get_header():
 2377
 2378            # Get header object
 2379            header_obj = self.get_header()
 2380
 2381            # Create database
 2382            db_for_header = Database(database=self.get_input())
 2383
 2384            # Get real columns in the file
 2385            db_header_columns = db_for_header.get_columns()
 2386
 2387            with tempfile.TemporaryDirectory() as tmpdir:
 2388
 2389                # Write header file
 2390                header_file_tmp = os.path.join(tmpdir, "header")
 2391                f = open(header_file_tmp, "w")
 2392                vcf.Writer(f, header_obj)
 2393                f.close()
 2394
 2395                # Replace #CHROM line with rel columns
 2396                header_list = db_for_header.read_header_file(
 2397                    header_file=header_file_tmp
 2398                )
 2399                header_list[-1] = "\t".join(db_header_columns)
 2400
 2401                # Remove CHROM line
 2402                if remove_chrom_line:
 2403                    header_list.pop()
 2404
 2405                # Clean header
 2406                if clean_header:
 2407                    header_list_clean = []
 2408                    for head in header_list:
 2409                        # Clean head for malformed header
 2410                        head_clean = head
 2411                        head_clean = re.subn(
 2412                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2413                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2414                            head_clean,
 2415                            2,
 2416                        )[0]
 2417                        # Write header
 2418                        header_list_clean.append(head_clean)
 2419                    header_list = header_list_clean
 2420
 2421            tmp_header_name = output_file + output_file_ext
 2422
 2423            f = open(tmp_header_name, "w")
 2424            for line in header_list:
 2425                f.write(line)
 2426            f.close()
 2427
 2428        return tmp_header_name
 2429
 2430    def export_variant_vcf(
 2431        self,
 2432        vcf_file,
 2433        remove_info: bool = False,
 2434        add_samples: bool = True,
 2435        list_samples: list = [],
 2436        where_clause: str = "",
 2437        index: bool = False,
 2438        threads: int | None = None,
 2439    ) -> bool | None:
 2440        """
 2441        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2442        remove INFO field, add samples, and control compression and indexing.
 2443
 2444        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2445        written to. It is the output file that will contain the filtered VCF data based on the specified
 2446        parameters
 2447        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2448        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2449        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2450        in, defaults to False
 2451        :type remove_info: bool (optional)
 2452        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2453        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2454        If set to False, the samples will be removed. The default value is True, defaults to True
 2455        :type add_samples: bool (optional)
 2456        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2457        in the output VCF file. By default, all samples will be included. If you provide a list of
 2458        samples, only those samples will be included in the output file
 2459        :type list_samples: list
 2460        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2461        determines whether or not to create an index for the output VCF file. If `index` is set to
 2462        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2463        :type index: bool (optional)
 2464        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2465        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2466        will be used during the export process. More threads can potentially speed up the export process
 2467        by utilizing multiple cores of the processor. If
 2468        :type threads: int | None
 2469        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2470        method with various parameters including the output file, query, threads, sort flag, and index
 2471        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2472        specified parameters and configurations provided in the `export_variant_vcf` function.
 2473        """
 2474
 2475        # Config
 2476        config = self.get_config()
 2477
 2478        # Extract VCF
 2479        log.debug("Export VCF...")
 2480
 2481        # Table variants
 2482        table_variants = self.get_table_variants()
 2483
 2484        # Threads
 2485        if not threads:
 2486            threads = self.get_threads()
 2487
 2488        # Info fields
 2489        if remove_info:
 2490            if not isinstance(remove_info, str):
 2491                remove_info = "."
 2492            info_field = f"""'{remove_info}' as INFO"""
 2493        else:
 2494            info_field = "INFO"
 2495
 2496        # Samples fields
 2497        if add_samples:
 2498            if not list_samples:
 2499                list_samples = self.get_header_sample_list()
 2500            if list_samples:
 2501                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2502            else:
 2503                samples_fields = ""
 2504            log.debug(f"samples_fields: {samples_fields}")
 2505        else:
 2506            samples_fields = ""
 2507
 2508        # Where clause
 2509        if where_clause is None:
 2510            where_clause = ""
 2511
 2512        # Variants
 2513        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2514        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2515        log.debug(f"sql_query_select={sql_query_select}")
 2516
 2517        return self.export_output(
 2518            output_file=vcf_file,
 2519            output_header=None,
 2520            export_header=True,
 2521            query=sql_query_select,
 2522            parquet_partitions=None,
 2523            chunk_size=config.get("chunk_size", None),
 2524            threads=threads,
 2525            sort=True,
 2526            index=index,
 2527            order_by=None,
 2528        )
 2529
 2530    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2531        """
 2532        It takes a list of commands and runs them in parallel using the number of threads specified
 2533
 2534        :param commands: A list of commands to run
 2535        :param threads: The number of threads to use, defaults to 1 (optional)
 2536        """
 2537
 2538        run_parallel_commands(commands, threads)
 2539
 2540    def get_threads(self, default: int = 1) -> int:
 2541        """
 2542        This function returns the number of threads to use for a job, with a default value of 1 if not
 2543        specified.
 2544
 2545        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2546        default number of threads to use if no specific value is provided. If no value is provided for
 2547        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2548        used, defaults to 1
 2549        :type default: int (optional)
 2550        :return: the number of threads to use for the current job.
 2551        """
 2552
 2553        # Config
 2554        config = self.get_config()
 2555
 2556        # Param
 2557        param = self.get_param()
 2558
 2559        # Input threads
 2560        input_thread = param.get("threads", config.get("threads", None))
 2561
 2562        # Check threads
 2563        if not input_thread:
 2564            threads = default
 2565        elif int(input_thread) <= 0:
 2566            threads = os.cpu_count()
 2567        else:
 2568            threads = int(input_thread)
 2569        return threads
 2570
 2571    def get_memory(self, default: str = None) -> str:
 2572        """
 2573        This function retrieves the memory value from parameters or configuration with a default value
 2574        if not found.
 2575
 2576        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2577        default value is used as a fallback in case the `memory` parameter is not provided in the
 2578        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2579        the function
 2580        :type default: str
 2581        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2582        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2583        return the default value provided as an argument to the function.
 2584        """
 2585
 2586        # Config
 2587        config = self.get_config()
 2588
 2589        # Param
 2590        param = self.get_param()
 2591
 2592        # Input threads
 2593        input_memory = param.get("memory", config.get("memory", None))
 2594
 2595        # Check threads
 2596        if input_memory:
 2597            memory = input_memory
 2598        else:
 2599            memory = default
 2600
 2601        return memory
 2602
 2603    def update_from_vcf(self, vcf_file: str) -> None:
 2604        """
 2605        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2606
 2607        :param vcf_file: the path to the VCF file
 2608        """
 2609
 2610        connexion_format = self.get_connexion_format()
 2611
 2612        if connexion_format in ["duckdb"]:
 2613            self.update_from_vcf_duckdb(vcf_file)
 2614        elif connexion_format in ["sqlite"]:
 2615            self.update_from_vcf_sqlite(vcf_file)
 2616
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file to the INFO column of the
        variants table (duckdb connexion).

        Rows are matched on #CHROM/POS/REF/ALT. When both the existing INFO
        and the incoming INFO carry a value, they are joined with ';';
        '' and '.' are treated as empty on both sides.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines;
        # duckdb resolves the local DataFrame 'vcf_df' by name in the query
        # below (replacement scan), so the variable is used even though no
        # Python code reads it
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the matching VCF INFO to the existing INFO, inserting ';'
        # only when both sides are non-empty
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2672
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file to the INFO column of the
        variants table (sqlite connexion).

        A temporary table is created with the variants schema, the VCF body
        is loaded into it, the variants table is updated (rows matched on
        #CHROM/POS/REF/ALT, values joined with ';' when both sides are
        non-empty), and the temporary table is dropped.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        # (WHERE 0 copies the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table, skipping header lines
        # (comment='#'); columns are assigned positionally
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: sqlite has no concat() function, so || is used instead
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop the temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
 2730
 2731    def drop_variants_table(self) -> None:
 2732        """
 2733        > This function drops the variants table
 2734        """
 2735
 2736        table_variants = self.get_table_variants()
 2737        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2738        self.conn.execute(sql_table_variants)
 2739
 2740    def set_variant_id(
 2741        self, variant_id_column: str = "variant_id", force: bool = None
 2742    ) -> str:
 2743        """
 2744        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2745        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2746
 2747        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2748        to variant_id
 2749        :type variant_id_column: str (optional)
 2750        :param force: If True, the variant_id column will be created even if it already exists
 2751        :type force: bool
 2752        :return: The name of the column that contains the variant_id
 2753        """
 2754
 2755        # Assembly
 2756        assembly = self.get_param().get(
 2757            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2758        )
 2759
 2760        # INFO/Tag prefix
 2761        prefix = self.get_explode_infos_prefix()
 2762
 2763        # Explode INFO/SVTYPE
 2764        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2765
 2766        # variants table
 2767        table_variants = self.get_table_variants()
 2768
 2769        # variant_id column
 2770        if not variant_id_column:
 2771            variant_id_column = "variant_id"
 2772
 2773        # Creta variant_id column
 2774        if "variant_id" not in self.get_extra_infos() or force:
 2775
 2776            # Create column
 2777            self.add_column(
 2778                table_name=table_variants,
 2779                column_name=variant_id_column,
 2780                column_type="UBIGINT",
 2781                default_value="0",
 2782            )
 2783
 2784            # Update column
 2785            self.conn.execute(
 2786                f"""
 2787                    UPDATE {table_variants}
 2788                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2789                """
 2790            )
 2791
 2792        # Remove added columns
 2793        for added_column in added_columns:
 2794            self.drop_column(column=added_column)
 2795
 2796        # return variant_id column name
 2797        return variant_id_column
 2798
 2799    def get_variant_id_column(
 2800        self, variant_id_column: str = "variant_id", force: bool = None
 2801    ) -> str:
 2802        """
 2803        This function returns the variant_id column name
 2804
 2805        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2806        defaults to variant_id
 2807        :type variant_id_column: str (optional)
 2808        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2809        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2810        if it is not already set, or if it is set
 2811        :type force: bool
 2812        :return: The variant_id column name.
 2813        """
 2814
 2815        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2816
 2817    ###
 2818    # Annotation
 2819    ###
 2820
 2821    def scan_databases(
 2822        self,
 2823        database_formats: list = ["parquet"],
 2824        database_releases: list = ["current"],
 2825    ) -> dict:
 2826        """
 2827        The function `scan_databases` scans for available databases based on specified formats and
 2828        releases.
 2829
 2830        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2831        of the databases to be scanned. In this case, the accepted format is "parquet"
 2832        :type database_formats: list ["parquet"]
 2833        :param database_releases: The `database_releases` parameter is a list that specifies the
 2834        releases of the databases to be scanned. In the provided function, the default value for
 2835        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2836        databases that are in the "current"
 2837        :type database_releases: list
 2838        :return: The function `scan_databases` returns a dictionary containing information about
 2839        databases that match the specified formats and releases.
 2840        """
 2841
 2842        # Config
 2843        config = self.get_config()
 2844
 2845        # Param
 2846        param = self.get_param()
 2847
 2848        # Param - Assembly
 2849        assembly = param.get("assembly", config.get("assembly", None))
 2850        if not assembly:
 2851            assembly = DEFAULT_ASSEMBLY
 2852            log.warning(f"Default assembly '{assembly}'")
 2853
 2854        # Scan for availabled databases
 2855        log.info(
 2856            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2857        )
 2858        databases_infos_dict = databases_infos(
 2859            database_folder_releases=database_releases,
 2860            database_formats=database_formats,
 2861            assembly=assembly,
 2862            config=config,
 2863        )
 2864        log.info(
 2865            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2866        )
 2867
 2868        return databases_infos_dict
 2869
    def annotation(self) -> None:
        """
        Annotate the VCF with every annotation source configured in param/config.

        The method works in two phases:

        1. Normalization of "quick annotation" parameters: the comma-separated
           `param["annotations"]` string and the per-tool shortcut keys
           (`annotation_parquet`, `annotation_snpsift`, `annotation_snpeff`,
           `annotation_bcftools`, `annotation_annovar`, `annotation_exomiser`,
           `annotation_splice`) are merged into a single list, then translated
           into the structured `param["annotation"]` dictionary (one sub-dict per
           annotation tool). Database files are resolved against the configured
           databases folders and the current assembly; the tool for each file is
           taken from its explicit prefix ("bcftools:", "snpsift:", "bigwig:") or
           inferred from the file format (vcf/bed/tsv/csv/json/tbl/parquet/duckdb
           -> parquet, bw -> bigwig). The special value "ALL[:format=...][:release=...]"
           expands to every database found by `scan_databases`.
        2. Dispatch: for each tool key present in `param["annotation"]`, the
           corresponding `annotation_<tool>()` method is called.

        Finally, INFO fields are exploded into table columns if requested
        (see `get_explode_infos`).

        :raises ValueError: if a resolved annotation file has a format that no
            annotation tool supports
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param wins over config; fall back to package default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Annotations databases folders: union of the generic "annotations"
        # folders and the tool-specific "parquet"/"bcftools" folders, used later
        # to locate database files given only a base name
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (only a comma-separated string is split here;
        # any other type yields an empty starting list)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tool's shortcut parameter is appended to the list, encoded with
        # the tool name as prefix ("tool:files") so phase 2 can route it.
        # For snpsift/bcftools, "," separators are rewritten to "+" so the
        # merged comma-joined string stays unambiguous.
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list back into a single comma-separated string
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: map each annotation spec to its
            # requested fields ({"INFO": None} means "all INFO fields")
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                # Already a dict of {annotation: fields}
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields (after "ALL" expansion)
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL: scan the databases folders and add
                # every database found, filtered by optional format/release options
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases),
                    # e.g. "ALL:format=parquet+vcf:release=current"
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases and fill param["annotation"] per tool
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: everything after "snpeff:" is passed
                    # verbatim as snpEff command-line options
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: each ":"-separated token after the
                    # prefix is an Annovar annotation (database) name
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser: the whole spec is parsed as
                    # "exomiser:key=value:..." options (see params_string_to_dict)
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice: same "key=value" option parsing as Exomiser
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS (file-based tools)
                    else:

                        # Tools detection: an explicit "tool:" prefix forces the
                        # tool; otherwise it is inferred from the file format below
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("bigwig:"):
                            annotation_tool_initial = "bigwig"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ("+" and ":" both act as separators)
                        # NOTE: this rebinds `annotation_file` as the loop
                        # variable below (intentional shadowing)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file: as given, then expanded (full_path),
                                # then within the "<databases folder>/<assembly>"
                                # folders, and finally (if no assembly) within the
                                # databases folders themselves
                                annotation_file_found = None

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file
                                elif os.path.exists(full_path(annotation_file)):
                                    annotation_file_found = full_path(annotation_file)
                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    # Indexed here means a tabix ".tbi" exists
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # bcftools auto-selection is disabled (parquet
                                    # is preferred even for indexed vcf/bed)
                                    bcftools_preference = False

                                    # Check Annotation Tool (only when no explicit
                                    # "tool:" prefix was given)
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        elif quick_annotation_format in [
                                            "bw"
                                        ]:
                                            annotation_tool = "bigwig"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the file
                                    # (with its requested fields) under the tool
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    # Unresolvable files are skipped with a warning,
                                    # not an error
                                    log.warning(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                self.set_param(param)

        # Phase 2: run each configured annotation tool
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 3249
 3250
 3251    def annotation_bigwig(self, threads: int = None) -> None:
 3252        """
 3253        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3254        
 3255        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3256        number of threads to be used for parallel processing during the annotation process. If the
 3257        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3258        threads to use based on the system configuration
 3259        :type threads: int
 3260        :return: True
 3261        """
 3262
 3263        # DEBUG
 3264        log.debug("Start annotation with bigwig databases")
 3265
 3266        # # Threads
 3267        # if not threads:
 3268        #     threads = self.get_threads()
 3269        # log.debug("Threads: " + str(threads))
 3270
 3271        # Config
 3272        config = self.get_config()
 3273        log.debug("Config: " + str(config))
 3274
 3275        # Config - BCFTools databases folders
 3276        databases_folders = set(
 3277            self.get_config()
 3278            .get("folders", {})
 3279            .get("databases", {})
 3280            .get("annotations", ["."])
 3281            + self.get_config()
 3282            .get("folders", {})
 3283            .get("databases", {})
 3284            .get("bigwig", ["."])
 3285        )
 3286        log.debug("Databases annotations: " + str(databases_folders))
 3287
 3288        # Param
 3289        annotations = (
 3290            self.get_param()
 3291            .get("annotation", {})
 3292            .get("bigwig", {})
 3293            .get("annotations", None)
 3294        )
 3295        log.debug("Annotations: " + str(annotations))
 3296
 3297        # Assembly
 3298        assembly = self.get_param().get(
 3299            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3300        )
 3301
 3302        # Data
 3303        table_variants = self.get_table_variants()
 3304
 3305        # Check if not empty
 3306        log.debug("Check if not empty")
 3307        sql_query_chromosomes = (
 3308            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3309        )
 3310        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3311        if not sql_query_chromosomes_df["count"][0]:
 3312            log.info(f"VCF empty")
 3313            return
 3314
 3315        # VCF header
 3316        vcf_reader = self.get_header()
 3317        log.debug("Initial header: " + str(vcf_reader.infos))
 3318
 3319        # Existing annotations
 3320        for vcf_annotation in self.get_header().infos:
 3321
 3322            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3323            log.debug(
 3324                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3325            )
 3326
 3327        if annotations:
 3328
 3329            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3330
 3331                # Export VCF file
 3332                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3333
 3334                # annotation_bigwig_config
 3335                annotation_bigwig_config_list = []
 3336
 3337                for annotation in annotations:
 3338                    annotation_fields = annotations[annotation]
 3339
 3340                    # Annotation Name
 3341                    annotation_name = os.path.basename(annotation)
 3342
 3343                    if not annotation_fields:
 3344                        annotation_fields = {"INFO": None}
 3345
 3346                    log.debug(f"Annotation '{annotation_name}'")
 3347                    log.debug(
 3348                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3349                    )
 3350
 3351                    # Create Database
 3352                    database = Database(
 3353                        database=annotation,
 3354                        databases_folders=databases_folders,
 3355                        assembly=assembly,
 3356                    )
 3357
 3358                    # Find files
 3359                    db_file = database.get_database()
 3360                    db_file = full_path(db_file)
 3361                    db_hdr_file = database.get_header_file()
 3362                    db_hdr_file = full_path(db_hdr_file)
 3363                    db_file_type = database.get_format()
 3364
 3365                    # If db_file is http ?
 3366                    if database.get_database().startswith("http"):
 3367
 3368                        # Datbase is HTTP URL
 3369                        db_file_is_http = True
 3370
 3371                        # DB file keep as URL
 3372                        db_file = database.get_database()
 3373                        log.warning(f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)")
 3374
 3375                        # Retrieve automatic annotation field name
 3376                        annotation_field = clean_annotation_field(os.path.basename(db_file).replace(".bw", ""))
 3377                        log.debug(f"Create header file with annotation field '{annotation_field}' is an HTTP URL")
 3378
 3379                        # Create automatic header file
 3380                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3381                        with open(db_hdr_file, 'w') as f:
 3382                            f.write("##fileformat=VCFv4.2\n")
 3383                            f.write(f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""")
 3384                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3385
 3386                    else:
 3387
 3388                        # Datbase is NOT HTTP URL
 3389                        db_file_is_http = False
 3390                    
 3391
 3392                    # Check index - try to create if not exists
 3393                    if db_file is None or db_hdr_file is None or (not os.path.exists(db_file) and not db_file_is_http) or not os.path.exists(db_hdr_file) or not db_file_type in ["bw"]:
 3394                    #if False:
 3395                        log.error("Annotation failed: database not valid")
 3396                        log.error(f"Annotation annotation file: {db_file}")
 3397                        log.error(f"Annotation annotation file type: {db_file_type}")
 3398                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3399                        raise ValueError(
 3400                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3401                        )
 3402                    else:
 3403
 3404                        # Log
 3405                        log.debug(
 3406                            f"Annotation '{annotation}' - file: "
 3407                            + str(db_file)
 3408                            + " and "
 3409                            + str(db_hdr_file)
 3410                        )
 3411
 3412                        # Load header as VCF object
 3413                        db_hdr_vcf = Variants(input=db_hdr_file)
 3414                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3415                        log.debug(
 3416                            "Annotation database header: "
 3417                            + str(db_hdr_vcf_header_infos)
 3418                        )
 3419
 3420                        # For all fields in database
 3421                        annotation_fields_full = False
 3422                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3423                            annotation_fields = {
 3424                                key: key for key in db_hdr_vcf_header_infos
 3425                            }
 3426                            log.debug(
 3427                                "Annotation database header - All annotations added: "
 3428                                + str(annotation_fields)
 3429                            )
 3430                            annotation_fields_full = True
 3431
 3432                        # Init
 3433                        cyvcf2_header_rename_dict = {}
 3434                        cyvcf2_header_list = []
 3435                        cyvcf2_header_indexes = {}
 3436
 3437                        # process annotation fields
 3438                        for annotation_field in annotation_fields:
 3439
 3440                            # New annotation name 
 3441                            annotation_field_new = annotation_fields[annotation_field]
 3442
 3443                            # Check annotation field and index in header
 3444                            if annotation_field in db_hdr_vcf.get_header_columns_as_list():
 3445                                annotation_field_index = db_hdr_vcf.get_header_columns_as_list().index(annotation_field)-3
 3446                                cyvcf2_header_indexes[annotation_field_new] = annotation_field_index
 3447                            else:
 3448                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3449                                log.error(msg_err)
 3450                                raise ValueError(msg_err)
 3451
 3452                            # Append annotation field in cyvcf2 header list
 3453                            cyvcf2_header_rename_dict[annotation_field_new] = db_hdr_vcf_header_infos[annotation_field].id
 3454                            cyvcf2_header_list.append(
 3455                                {
 3456                                    "ID": annotation_field_new,
 3457                                    "Number": db_hdr_vcf_header_infos[annotation_field].num,
 3458                                    "Type": db_hdr_vcf_header_infos[annotation_field].type,
 3459                                    "Description": db_hdr_vcf_header_infos[annotation_field].desc,
 3460                                }
 3461                            )
 3462
 3463                        # Load bigwig database
 3464                        bw_db = pyBigWig.open(db_file)
 3465                        if bw_db.isBigWig():
 3466                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3467                        else:
 3468                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3469                            log.error(msg_err)
 3470                            raise ValueError(msg_err)
 3471
 3472                        annotation_bigwig_config_list.append(
 3473                            {
 3474                                "db_file": db_file,
 3475                                "bw_db": bw_db,
 3476                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3477                                "cyvcf2_header_list": cyvcf2_header_list,
 3478                                "cyvcf2_header_indexes": cyvcf2_header_indexes
 3479                            }
 3480                        )
 3481
 3482                # Annotate
 3483                if annotation_bigwig_config_list:
 3484
 3485                    # Annotation config
 3486                    log.debug(f"annotation_bigwig_config={annotation_bigwig_config_list}")
 3487
 3488                    # Export VCF file
 3489                    self.export_variant_vcf(
 3490                        vcf_file=tmp_vcf_name,
 3491                        remove_info=True,
 3492                        add_samples=False,
 3493                        index=True,
 3494                    )
 3495
 3496                    # Load input tmp file
 3497                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3498
 3499                    # Add header in input file
 3500                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3501                        for cyvcf2_header_field in annotation_bigwig_config.get("cyvcf2_header_list",[]):
 3502                            log.info(f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'")
 3503                            input_vcf.add_info_to_header(
 3504                                cyvcf2_header_field
 3505                            )
 3506
 3507                    # Create output VCF file
 3508                    output_vcf_file = os.path.join(tmp_dir,"output.vcf.gz")
 3509                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3510
 3511                    # Fetch variants
 3512                    log.info(f"Annotations 'bigwig' start...")
 3513                    for variant in input_vcf:
 3514
 3515                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3516
 3517                            # DB and indexes
 3518                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3519                            cyvcf2_header_indexes = annotation_bigwig_config.get("cyvcf2_header_indexes", None)
 3520
 3521                            # Retrieve value from chrom pos
 3522                            res = bw_db.values(variant.CHROM, variant.POS - 1, variant.POS)
 3523                            
 3524                            # For each annotation fields (and indexes)
 3525                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3526
 3527                                # If value is NOT nNone
 3528                                if not np.isnan(res[cyvcf2_header_indexes[cyvcf2_header_index]]):
 3529                                    variant.INFO[cyvcf2_header_index] = res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3530
 3531                        # Add record in output file
 3532                        output_vcf.write_record(variant)
 3533
 3534                    # Log
 3535                    log.debug(f"Annotation done.")
 3536
 3537                    # Close and write file
 3538                    log.info(f"Annotations 'bigwig' write...")
 3539                    output_vcf.close()
 3540                    log.debug(f"Write done.")
 3541
 3542                    # Update variants
 3543                    log.info(f"Annotations 'bigwig' update...")
 3544                    self.update_from_vcf(output_vcf_file)
 3545                    log.debug(f"Update done.")
 3546
 3547        return True
 3548
 3549
 3550    def annotation_snpsift(self, threads: int = None) -> None:
 3551        """
 3552        This function annotate with bcftools
 3553
 3554        :param threads: Number of threads to use
 3555        :return: the value of the variable "return_value".
 3556        """
 3557
 3558        # DEBUG
 3559        log.debug("Start annotation with bcftools databases")
 3560
 3561        # Threads
 3562        if not threads:
 3563            threads = self.get_threads()
 3564        log.debug("Threads: " + str(threads))
 3565
 3566        # Config
 3567        config = self.get_config()
 3568        log.debug("Config: " + str(config))
 3569
 3570        # Config - snpSift
 3571        snpsift_bin_command = get_bin_command(
 3572            bin="SnpSift.jar",
 3573            tool="snpsift",
 3574            bin_type="jar",
 3575            config=config,
 3576            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3577        )
 3578        if not snpsift_bin_command:
 3579            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3580            log.error(msg_err)
 3581            raise ValueError(msg_err)
 3582
 3583        # Config - bcftools
 3584        bcftools_bin_command = get_bin_command(
 3585            bin="bcftools",
 3586            tool="bcftools",
 3587            bin_type="bin",
 3588            config=config,
 3589            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3590        )
 3591        if not bcftools_bin_command:
 3592            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3593            log.error(msg_err)
 3594            raise ValueError(msg_err)
 3595
 3596        # Config - BCFTools databases folders
 3597        databases_folders = set(
 3598            self.get_config()
 3599            .get("folders", {})
 3600            .get("databases", {})
 3601            .get("annotations", ["."])
 3602            + self.get_config()
 3603            .get("folders", {})
 3604            .get("databases", {})
 3605            .get("bcftools", ["."])
 3606        )
 3607        log.debug("Databases annotations: " + str(databases_folders))
 3608
 3609        # Param
 3610        annotations = (
 3611            self.get_param()
 3612            .get("annotation", {})
 3613            .get("snpsift", {})
 3614            .get("annotations", None)
 3615        )
 3616        log.debug("Annotations: " + str(annotations))
 3617
 3618        # Assembly
 3619        assembly = self.get_param().get(
 3620            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3621        )
 3622
 3623        # Data
 3624        table_variants = self.get_table_variants()
 3625
 3626        # Check if not empty
 3627        log.debug("Check if not empty")
 3628        sql_query_chromosomes = (
 3629            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3630        )
 3631        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3632        if not sql_query_chromosomes_df["count"][0]:
 3633            log.info(f"VCF empty")
 3634            return
 3635
 3636        # VCF header
 3637        vcf_reader = self.get_header()
 3638        log.debug("Initial header: " + str(vcf_reader.infos))
 3639
 3640        # Existing annotations
 3641        for vcf_annotation in self.get_header().infos:
 3642
 3643            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3644            log.debug(
 3645                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3646            )
 3647
 3648        if annotations:
 3649
 3650            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3651
 3652                # Export VCF file
 3653                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3654
 3655                # Init
 3656                commands = {}
 3657
 3658                for annotation in annotations:
 3659                    annotation_fields = annotations[annotation]
 3660
 3661                    # Annotation Name
 3662                    annotation_name = os.path.basename(annotation)
 3663
 3664                    if not annotation_fields:
 3665                        annotation_fields = {"INFO": None}
 3666
 3667                    log.debug(f"Annotation '{annotation_name}'")
 3668                    log.debug(
 3669                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3670                    )
 3671
 3672                    # Create Database
 3673                    database = Database(
 3674                        database=annotation,
 3675                        databases_folders=databases_folders,
 3676                        assembly=assembly,
 3677                    )
 3678
 3679                    # Find files
 3680                    db_file = database.get_database()
 3681                    db_file = full_path(db_file)
 3682                    db_hdr_file = database.get_header_file()
 3683                    db_hdr_file = full_path(db_hdr_file)
 3684                    db_file_type = database.get_format()
 3685                    db_tbi_file = f"{db_file}.tbi"
 3686                    db_file_compressed = database.is_compressed()
 3687
 3688                    # Check if compressed
 3689                    if not db_file_compressed:
 3690                        log.error(
 3691                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3692                        )
 3693                        raise ValueError(
 3694                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3695                        )
 3696
 3697                    # Check if indexed
 3698                    if not os.path.exists(db_tbi_file):
 3699                        log.error(
 3700                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3701                        )
 3702                        raise ValueError(
 3703                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3704                        )
 3705
 3706                    # Check index - try to create if not exists
 3707                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3708                        log.error("Annotation failed: database not valid")
 3709                        log.error(f"Annotation annotation file: {db_file}")
 3710                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3711                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3712                        raise ValueError(
 3713                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3714                        )
 3715                    else:
 3716
 3717                        log.debug(
 3718                            f"Annotation '{annotation}' - file: "
 3719                            + str(db_file)
 3720                            + " and "
 3721                            + str(db_hdr_file)
 3722                        )
 3723
 3724                        # Load header as VCF object
 3725                        db_hdr_vcf = Variants(input=db_hdr_file)
 3726                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3727                        log.debug(
 3728                            "Annotation database header: "
 3729                            + str(db_hdr_vcf_header_infos)
 3730                        )
 3731
 3732                        # For all fields in database
 3733                        annotation_fields_full = False
 3734                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3735                            annotation_fields = {
 3736                                key: key for key in db_hdr_vcf_header_infos
 3737                            }
 3738                            log.debug(
 3739                                "Annotation database header - All annotations added: "
 3740                                + str(annotation_fields)
 3741                            )
 3742                            annotation_fields_full = True
 3743
 3744                        # # Create file for field rename
 3745                        # log.debug("Create file for field rename")
 3746                        # tmp_rename = NamedTemporaryFile(
 3747                        #     prefix=self.get_prefix(),
 3748                        #     dir=self.get_tmp_dir(),
 3749                        #     suffix=".rename",
 3750                        #     delete=False,
 3751                        # )
 3752                        # tmp_rename_name = tmp_rename.name
 3753                        # tmp_files.append(tmp_rename_name)
 3754
 3755                        # Number of fields
 3756                        nb_annotation_field = 0
 3757                        annotation_list = []
 3758                        annotation_infos_rename_list = []
 3759
 3760                        for annotation_field in annotation_fields:
 3761
 3762                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3763                            annotation_fields_new_name = annotation_fields.get(
 3764                                annotation_field, annotation_field
 3765                            )
 3766                            if not annotation_fields_new_name:
 3767                                annotation_fields_new_name = annotation_field
 3768
 3769                            # Check if field is in DB and if field is not elready in input data
 3770                            if (
 3771                                annotation_field in db_hdr_vcf.get_header().infos
 3772                                and annotation_fields_new_name
 3773                                not in self.get_header().infos
 3774                            ):
 3775
 3776                                log.info(
 3777                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3778                                )
 3779
 3780                                # BCFTools annotate param to rename fields
 3781                                if annotation_field != annotation_fields_new_name:
 3782                                    annotation_infos_rename_list.append(
 3783                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3784                                    )
 3785
 3786                                # Add INFO field to header
 3787                                db_hdr_vcf_header_infos_number = (
 3788                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3789                                )
 3790                                db_hdr_vcf_header_infos_type = (
 3791                                    db_hdr_vcf_header_infos[annotation_field].type
 3792                                    or "String"
 3793                                )
 3794                                db_hdr_vcf_header_infos_description = (
 3795                                    db_hdr_vcf_header_infos[annotation_field].desc
 3796                                    or f"{annotation_field} description"
 3797                                )
 3798                                db_hdr_vcf_header_infos_source = (
 3799                                    db_hdr_vcf_header_infos[annotation_field].source
 3800                                    or "unknown"
 3801                                )
 3802                                db_hdr_vcf_header_infos_version = (
 3803                                    db_hdr_vcf_header_infos[annotation_field].version
 3804                                    or "unknown"
 3805                                )
 3806
 3807                                vcf_reader.infos[annotation_fields_new_name] = (
 3808                                    vcf.parser._Info(
 3809                                        annotation_fields_new_name,
 3810                                        db_hdr_vcf_header_infos_number,
 3811                                        db_hdr_vcf_header_infos_type,
 3812                                        db_hdr_vcf_header_infos_description,
 3813                                        db_hdr_vcf_header_infos_source,
 3814                                        db_hdr_vcf_header_infos_version,
 3815                                        self.code_type_map[
 3816                                            db_hdr_vcf_header_infos_type
 3817                                        ],
 3818                                    )
 3819                                )
 3820
 3821                                annotation_list.append(annotation_field)
 3822
 3823                                nb_annotation_field += 1
 3824
 3825                            else:
 3826
 3827                                if (
 3828                                    annotation_field
 3829                                    not in db_hdr_vcf.get_header().infos
 3830                                ):
 3831                                    log.warning(
 3832                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3833                                    )
 3834                                if (
 3835                                    annotation_fields_new_name
 3836                                    in self.get_header().infos
 3837                                ):
 3838                                    log.warning(
 3839                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3840                                    )
 3841
 3842                        log.info(
 3843                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3844                        )
 3845
 3846                        annotation_infos = ",".join(annotation_list)
 3847
 3848                        if annotation_infos != "":
 3849
 3850                            # Annotated VCF (and error file)
 3851                            tmp_annotation_vcf_name = os.path.join(
 3852                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3853                            )
 3854                            tmp_annotation_vcf_name_err = (
 3855                                tmp_annotation_vcf_name + ".err"
 3856                            )
 3857
 3858                            # Add fields to annotate
 3859                            if not annotation_fields_full:
 3860                                annotation_infos_option = f"-info {annotation_infos}"
 3861                            else:
 3862                                annotation_infos_option = ""
 3863
 3864                            # Info fields rename
 3865                            if annotation_infos_rename_list:
 3866                                annotation_infos_rename = " -c " + ",".join(
 3867                                    annotation_infos_rename_list
 3868                                )
 3869                            else:
 3870                                annotation_infos_rename = ""
 3871
 3872                            # Annotate command
 3873                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3874
 3875                            # Add command
 3876                            commands[command_annotate] = tmp_annotation_vcf_name
 3877
 3878                if commands:
 3879
 3880                    # Export VCF file
 3881                    self.export_variant_vcf(
 3882                        vcf_file=tmp_vcf_name,
 3883                        remove_info=True,
 3884                        add_samples=False,
 3885                        index=True,
 3886                    )
 3887                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3888
 3889                    # Num command
 3890                    nb_command = 0
 3891
 3892                    # Annotate
 3893                    for command_annotate in commands:
 3894                        nb_command += 1
 3895                        log.info(
 3896                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3897                        )
 3898                        log.debug(f"command_annotate={command_annotate}")
 3899                        run_parallel_commands([command_annotate], threads)
 3900
 3901                        # Debug
 3902                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3903
 3904                        # Update variants
 3905                        log.info(
 3906                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3907                        )
 3908                        self.update_from_vcf(commands[command_annotate])
 3909
 3910
 3911    def annotation_bcftools(self, threads: int = None) -> None:
 3912        """
 3913        This function annotate with bcftools
 3914
 3915        :param threads: Number of threads to use
 3916        :return: the value of the variable "return_value".
 3917        """
 3918
 3919        # DEBUG
 3920        log.debug("Start annotation with bcftools databases")
 3921
 3922        # Threads
 3923        if not threads:
 3924            threads = self.get_threads()
 3925        log.debug("Threads: " + str(threads))
 3926
 3927        # Config
 3928        config = self.get_config()
 3929        log.debug("Config: " + str(config))
 3930
 3931        # DEBUG
 3932        delete_tmp = True
 3933        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3934            delete_tmp = False
 3935            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3936
 3937        # Config - BCFTools bin command
 3938        bcftools_bin_command = get_bin_command(
 3939            bin="bcftools",
 3940            tool="bcftools",
 3941            bin_type="bin",
 3942            config=config,
 3943            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3944        )
 3945        if not bcftools_bin_command:
 3946            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3947            log.error(msg_err)
 3948            raise ValueError(msg_err)
 3949
 3950        # Config - BCFTools databases folders
 3951        databases_folders = set(
 3952            self.get_config()
 3953            .get("folders", {})
 3954            .get("databases", {})
 3955            .get("annotations", ["."])
 3956            + self.get_config()
 3957            .get("folders", {})
 3958            .get("databases", {})
 3959            .get("bcftools", ["."])
 3960        )
 3961        log.debug("Databases annotations: " + str(databases_folders))
 3962
 3963        # Param
 3964        annotations = (
 3965            self.get_param()
 3966            .get("annotation", {})
 3967            .get("bcftools", {})
 3968            .get("annotations", None)
 3969        )
 3970        log.debug("Annotations: " + str(annotations))
 3971
 3972        # Assembly
 3973        assembly = self.get_param().get(
 3974            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3975        )
 3976
 3977        # Data
 3978        table_variants = self.get_table_variants()
 3979
 3980        # Check if not empty
 3981        log.debug("Check if not empty")
 3982        sql_query_chromosomes = (
 3983            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3984        )
 3985        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3986        if not sql_query_chromosomes_df["count"][0]:
 3987            log.info(f"VCF empty")
 3988            return
 3989
 3990        # Export in VCF
 3991        log.debug("Create initial file to annotate")
 3992        tmp_vcf = NamedTemporaryFile(
 3993            prefix=self.get_prefix(),
 3994            dir=self.get_tmp_dir(),
 3995            suffix=".vcf.gz",
 3996            delete=False,
 3997        )
 3998        tmp_vcf_name = tmp_vcf.name
 3999
 4000        # VCF header
 4001        vcf_reader = self.get_header()
 4002        log.debug("Initial header: " + str(vcf_reader.infos))
 4003
 4004        # Existing annotations
 4005        for vcf_annotation in self.get_header().infos:
 4006
 4007            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4008            log.debug(
 4009                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4010            )
 4011
 4012        if annotations:
 4013
 4014            tmp_ann_vcf_list = []
 4015            commands = []
 4016            tmp_files = []
 4017            err_files = []
 4018
 4019            for annotation in annotations:
 4020                annotation_fields = annotations[annotation]
 4021
 4022                # Annotation Name
 4023                annotation_name = os.path.basename(annotation)
 4024
 4025                if not annotation_fields:
 4026                    annotation_fields = {"INFO": None}
 4027
 4028                log.debug(f"Annotation '{annotation_name}'")
 4029                log.debug(
 4030                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4031                )
 4032
 4033                # Create Database
 4034                database = Database(
 4035                    database=annotation,
 4036                    databases_folders=databases_folders,
 4037                    assembly=assembly,
 4038                )
 4039
 4040                # Find files
 4041                db_file = database.get_database()
 4042                db_file = full_path(db_file)
 4043                db_hdr_file = database.get_header_file()
 4044                db_hdr_file = full_path(db_hdr_file)
 4045                db_file_type = database.get_format()
 4046                db_tbi_file = f"{db_file}.tbi"
 4047                db_file_compressed = database.is_compressed()
 4048
 4049                # Check if compressed
 4050                if not db_file_compressed:
 4051                    log.error(
 4052                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4053                    )
 4054                    raise ValueError(
 4055                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4056                    )
 4057
 4058                # Check if indexed
 4059                if not os.path.exists(db_tbi_file):
 4060                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4061                    raise ValueError(
 4062                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4063                    )
 4064
 4065                # Check index - try to create if not exists
 4066                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4067                    log.error("Annotation failed: database not valid")
 4068                    log.error(f"Annotation annotation file: {db_file}")
 4069                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4070                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4071                    raise ValueError(
 4072                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4073                    )
 4074                else:
 4075
 4076                    log.debug(
 4077                        f"Annotation '{annotation}' - file: "
 4078                        + str(db_file)
 4079                        + " and "
 4080                        + str(db_hdr_file)
 4081                    )
 4082
 4083                    # Load header as VCF object
 4084                    db_hdr_vcf = Variants(input=db_hdr_file)
 4085                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4086                    log.debug(
 4087                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4088                    )
 4089
 4090                    # For all fields in database
 4091                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4092                        annotation_fields = {
 4093                            key: key for key in db_hdr_vcf_header_infos
 4094                        }
 4095                        log.debug(
 4096                            "Annotation database header - All annotations added: "
 4097                            + str(annotation_fields)
 4098                        )
 4099
 4100                    # Number of fields
 4101                    nb_annotation_field = 0
 4102                    annotation_list = []
 4103
 4104                    for annotation_field in annotation_fields:
 4105
 4106                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4107                        annotation_fields_new_name = annotation_fields.get(
 4108                            annotation_field, annotation_field
 4109                        )
 4110                        if not annotation_fields_new_name:
 4111                            annotation_fields_new_name = annotation_field
 4112
 4113                        # Check if field is in DB and if field is not elready in input data
 4114                        if (
 4115                            annotation_field in db_hdr_vcf.get_header().infos
 4116                            and annotation_fields_new_name
 4117                            not in self.get_header().infos
 4118                        ):
 4119
 4120                            log.info(
 4121                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4122                            )
 4123
 4124                            # Add INFO field to header
 4125                            db_hdr_vcf_header_infos_number = (
 4126                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4127                            )
 4128                            db_hdr_vcf_header_infos_type = (
 4129                                db_hdr_vcf_header_infos[annotation_field].type
 4130                                or "String"
 4131                            )
 4132                            db_hdr_vcf_header_infos_description = (
 4133                                db_hdr_vcf_header_infos[annotation_field].desc
 4134                                or f"{annotation_field} description"
 4135                            )
 4136                            db_hdr_vcf_header_infos_source = (
 4137                                db_hdr_vcf_header_infos[annotation_field].source
 4138                                or "unknown"
 4139                            )
 4140                            db_hdr_vcf_header_infos_version = (
 4141                                db_hdr_vcf_header_infos[annotation_field].version
 4142                                or "unknown"
 4143                            )
 4144
 4145                            vcf_reader.infos[annotation_fields_new_name] = (
 4146                                vcf.parser._Info(
 4147                                    annotation_fields_new_name,
 4148                                    db_hdr_vcf_header_infos_number,
 4149                                    db_hdr_vcf_header_infos_type,
 4150                                    db_hdr_vcf_header_infos_description,
 4151                                    db_hdr_vcf_header_infos_source,
 4152                                    db_hdr_vcf_header_infos_version,
 4153                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4154                                )
 4155                            )
 4156
 4157                            # annotation_list.append(annotation_field)
 4158                            if annotation_field != annotation_fields_new_name:
 4159                                annotation_list.append(
 4160                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4161                                )
 4162                            else:
 4163                                annotation_list.append(annotation_field)
 4164
 4165                            nb_annotation_field += 1
 4166
 4167                        else:
 4168
 4169                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4170                                log.warning(
 4171                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4172                                )
 4173                            if annotation_fields_new_name in self.get_header().infos:
 4174                                log.warning(
 4175                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4176                                )
 4177
 4178                    log.info(
 4179                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4180                    )
 4181
 4182                    annotation_infos = ",".join(annotation_list)
 4183
 4184                    if annotation_infos != "":
 4185
 4186                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4187                        log.debug("Protect Header file - remove #CHROM line if exists")
 4188                        tmp_header_vcf = NamedTemporaryFile(
 4189                            prefix=self.get_prefix(),
 4190                            dir=self.get_tmp_dir(),
 4191                            suffix=".hdr",
 4192                            delete=False,
 4193                        )
 4194                        tmp_header_vcf_name = tmp_header_vcf.name
 4195                        tmp_files.append(tmp_header_vcf_name)
 4196                        # Command
 4197                        if db_hdr_file.endswith(".gz"):
 4198                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4199                        else:
 4200                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4201                        # Run
 4202                        run_parallel_commands([command_extract_header], 1)
 4203
 4204                        # Find chomosomes
 4205                        log.debug("Find chromosomes ")
 4206                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4207                        sql_query_chromosomes_df = self.get_query_to_df(
 4208                            sql_query_chromosomes
 4209                        )
 4210                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4211
 4212                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4213
 4214                        # BED columns in the annotation file
 4215                        if db_file_type in ["bed"]:
 4216                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4217
 4218                        for chrom in chomosomes_list:
 4219
 4220                            # Create BED on initial VCF
 4221                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4222                            tmp_bed = NamedTemporaryFile(
 4223                                prefix=self.get_prefix(),
 4224                                dir=self.get_tmp_dir(),
 4225                                suffix=".bed",
 4226                                delete=False,
 4227                            )
 4228                            tmp_bed_name = tmp_bed.name
 4229                            tmp_files.append(tmp_bed_name)
 4230
 4231                            # Detecte regions
 4232                            log.debug(
 4233                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4234                            )
 4235                            window = 1000000
 4236                            sql_query_intervals_for_bed = f"""
 4237                                SELECT  \"#CHROM\",
 4238                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4239                                        \"POS\"+{window}
 4240                                FROM {table_variants} as table_variants
 4241                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4242                            """
 4243                            regions = self.conn.execute(
 4244                                sql_query_intervals_for_bed
 4245                            ).fetchall()
 4246                            merged_regions = merge_regions(regions)
 4247                            log.debug(
 4248                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4249                            )
 4250
 4251                            header = ["#CHROM", "START", "END"]
 4252                            with open(tmp_bed_name, "w") as f:
 4253                                # Write the header with tab delimiter
 4254                                f.write("\t".join(header) + "\n")
 4255                                for d in merged_regions:
 4256                                    # Write each data row with tab delimiter
 4257                                    f.write("\t".join(map(str, d)) + "\n")
 4258
 4259                            # Tmp files
 4260                            tmp_annotation_vcf = NamedTemporaryFile(
 4261                                prefix=self.get_prefix(),
 4262                                dir=self.get_tmp_dir(),
 4263                                suffix=".vcf.gz",
 4264                                delete=False,
 4265                            )
 4266                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4267                            tmp_files.append(tmp_annotation_vcf_name)
 4268                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4269                            tmp_annotation_vcf_name_err = (
 4270                                tmp_annotation_vcf_name + ".err"
 4271                            )
 4272                            err_files.append(tmp_annotation_vcf_name_err)
 4273
 4274                            # Annotate Command
 4275                            log.debug(
 4276                                f"Annotation '{annotation}' - add bcftools command"
 4277                            )
 4278
 4279                            # Command
 4280                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4281
 4282                            # Add command
 4283                            commands.append(command_annotate)
 4284
 4285            # if some commands
 4286            if commands:
 4287
 4288                # Export VCF file
 4289                self.export_variant_vcf(
 4290                    vcf_file=tmp_vcf_name,
 4291                    remove_info=True,
 4292                    add_samples=False,
 4293                    index=True,
 4294                )
 4295
 4296                # Threads
 4297                # calculate threads for annotated commands
 4298                if commands:
 4299                    threads_bcftools_annotate = round(threads / len(commands))
 4300                else:
 4301                    threads_bcftools_annotate = 1
 4302
 4303                if not threads_bcftools_annotate:
 4304                    threads_bcftools_annotate = 1
 4305
 4306                # Add threads option to bcftools commands
 4307                if threads_bcftools_annotate > 1:
 4308                    commands_threaded = []
 4309                    for command in commands:
 4310                        commands_threaded.append(
 4311                            command.replace(
 4312                                f"{bcftools_bin_command} annotate ",
 4313                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4314                            )
 4315                        )
 4316                    commands = commands_threaded
 4317
 4318                # Command annotation multithreading
 4319                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4320                log.info(
 4321                    f"Annotation - Annotation multithreaded in "
 4322                    + str(len(commands))
 4323                    + " commands"
 4324                )
 4325
 4326                run_parallel_commands(commands, threads)
 4327
 4328                # Merge
 4329                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4330
 4331                if tmp_ann_vcf_list_cmd:
 4332
 4333                    # Tmp file
 4334                    tmp_annotate_vcf = NamedTemporaryFile(
 4335                        prefix=self.get_prefix(),
 4336                        dir=self.get_tmp_dir(),
 4337                        suffix=".vcf.gz",
 4338                        delete=True,
 4339                    )
 4340                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4341                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4342                    err_files.append(tmp_annotate_vcf_name_err)
 4343
 4344                    # Tmp file remove command
 4345                    tmp_files_remove_command = ""
 4346                    if tmp_files:
 4347                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4348
 4349                    # Command merge
 4350                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4351                    log.info(
 4352                        f"Annotation - Annotation merging "
 4353                        + str(len(commands))
 4354                        + " annotated files"
 4355                    )
 4356                    log.debug(f"Annotation - merge command: {merge_command}")
 4357                    run_parallel_commands([merge_command], 1)
 4358
 4359                    # Error messages
 4360                    log.info(f"Error/Warning messages:")
 4361                    error_message_command_all = []
 4362                    error_message_command_warning = []
 4363                    error_message_command_err = []
 4364                    for err_file in err_files:
 4365                        with open(err_file, "r") as f:
 4366                            for line in f:
 4367                                message = line.strip()
 4368                                error_message_command_all.append(message)
 4369                                if line.startswith("[W::"):
 4370                                    error_message_command_warning.append(message)
 4371                                if line.startswith("[E::"):
 4372                                    error_message_command_err.append(
 4373                                        f"{err_file}: " + message
 4374                                    )
 4375                    # log info
 4376                    for message in list(
 4377                        set(error_message_command_err + error_message_command_warning)
 4378                    ):
 4379                        log.info(f"   {message}")
 4380                    # debug info
 4381                    for message in list(set(error_message_command_all)):
 4382                        log.debug(f"   {message}")
 4383                    # failed
 4384                    if len(error_message_command_err):
 4385                        log.error("Annotation failed: Error in commands")
 4386                        raise ValueError("Annotation failed: Error in commands")
 4387
 4388                    # Update variants
 4389                    log.info(f"Annotation - Updating...")
 4390                    self.update_from_vcf(tmp_annotate_vcf_name)
 4391
 4392    def annotation_exomiser(self, threads: int = None) -> None:
 4393        """
 4394        This function annotate with Exomiser
 4395
 4396        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4397        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
 4399            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 4401            Default : None
 4402        - "preset" (string):
 4403            Analysis preset (available in config folder).
 4404            Used if no full "analysis" is provided.
 4405            Default: "exome"
 4406        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
 4408            Either a dict, or a file in JSON or YAML format.
 4409            Default: None
 4410        - "subject" (dict):
 4411            Sample parameters (see Exomiser docs).
 4412            Example:
 4413                "subject":
 4414                    {
 4415                        "id": "ISDBM322017",
 4416                        "sex": "FEMALE"
 4417                    }
 4418            Default: None
 4419        - "sample" (string):
 4420            Sample name to construct "subject" section:
 4421                "subject":
 4422                    {
 4423                        "id": "<sample>",
 4424                        "sex": "UNKNOWN_SEX"
 4425                    }
 4426            Default: None
 4427        - "phenotypicFeatures" (dict)
 4428            Phenotypic features to construct "subject" section.
 4429            Example:
 4430                "phenotypicFeatures":
 4431                    [
 4432                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4433                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4434                    ]
 4435        - "hpo" (list)
 4436            List of HPO ids as phenotypic features.
 4437            Example:
 4438                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4439            Default: []
 4440        - "outputOptions" (dict):
 4441            Output options (see Exomiser docs).
 4442            Default:
 4443                "output_options" =
 4444                    {
 4445                        "outputContributingVariantsOnly": False,
 4446                        "numGenes": 0,
 4447                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4448                    }
 4449        - "transcript_source" (string):
 4450            Transcript source (either "refseq", "ucsc", "ensembl")
 4451            Default: "refseq"
 4452        - "exomiser_to_info" (boolean):
 4453            Add exomiser TSV file columns as INFO fields in VCF.
 4454            Default: False
 4455        - "release" (string):
            Exomiser database release.
 4457            If not exists, database release will be downloaded (take a while).
 4458            Default: None (provided by application.properties configuration file)
 4459        - "exomiser_application_properties" (file):
 4460            Exomiser configuration file (see Exomiser docs).
 4461            Useful to automatically download databases (especially for specific genome databases).
 4462
 4463        Notes:
 4464        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
 4466
 4467        :param threads: The number of threads to use
 4468        :return: None.
 4469        """
 4470
 4471        # DEBUG
 4472        log.debug("Start annotation with Exomiser databases")
 4473
 4474        # Threads
 4475        if not threads:
 4476            threads = self.get_threads()
 4477        log.debug("Threads: " + str(threads))
 4478
 4479        # Config
 4480        config = self.get_config()
 4481        log.debug("Config: " + str(config))
 4482
 4483        # Config - Folders - Databases
 4484        databases_folders = (
 4485            config.get("folders", {})
 4486            .get("databases", {})
 4487            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4488        )
 4489        databases_folders = full_path(databases_folders)
 4490        if not os.path.exists(databases_folders):
 4491            log.error(f"Databases annotations: {databases_folders} NOT found")
 4492        log.debug("Databases annotations: " + str(databases_folders))
 4493
 4494        # Config - Exomiser
 4495        exomiser_bin_command = get_bin_command(
 4496            bin="exomiser-cli*.jar",
 4497            tool="exomiser",
 4498            bin_type="jar",
 4499            config=config,
 4500            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4501        )
 4502        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4503        if not exomiser_bin_command:
 4504            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4505            log.error(msg_err)
 4506            raise ValueError(msg_err)
 4507
 4508        # Param
 4509        param = self.get_param()
 4510        log.debug("Param: " + str(param))
 4511
 4512        # Param - Exomiser
 4513        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4514        log.debug(f"Param Exomiser: {param_exomiser}")
 4515
 4516        # Param - Assembly
 4517        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4518        log.debug("Assembly: " + str(assembly))
 4519
 4520        # Data
 4521        table_variants = self.get_table_variants()
 4522
 4523        # Check if not empty
 4524        log.debug("Check if not empty")
 4525        sql_query_chromosomes = (
 4526            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4527        )
 4528        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4529            log.info(f"VCF empty")
 4530            return False
 4531
 4532        # VCF header
 4533        vcf_reader = self.get_header()
 4534        log.debug("Initial header: " + str(vcf_reader.infos))
 4535
 4536        # Samples
 4537        samples = self.get_header_sample_list()
 4538        if not samples:
 4539            log.error("No Samples in VCF")
 4540            return False
 4541        log.debug(f"Samples: {samples}")
 4542
 4543        # Memory limit
 4544        memory_limit = self.get_memory("8G")
 4545        log.debug(f"memory_limit: {memory_limit}")
 4546
 4547        # Exomiser java options
 4548        exomiser_java_options = (
 4549            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4550        )
 4551        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4552
 4553        # Download Exomiser (if not exists)
 4554        exomiser_release = param_exomiser.get("release", None)
 4555        exomiser_application_properties = param_exomiser.get(
 4556            "exomiser_application_properties", None
 4557        )
 4558        databases_download_exomiser(
 4559            assemblies=[assembly],
 4560            exomiser_folder=databases_folders,
 4561            exomiser_release=exomiser_release,
 4562            exomiser_phenotype_release=exomiser_release,
 4563            exomiser_application_properties=exomiser_application_properties,
 4564        )
 4565
 4566        # Force annotation
 4567        force_update_annotation = True
 4568
 4569        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4570            log.debug("Start annotation Exomiser")
 4571
 4572            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4573
 4574                # tmp_dir = "/tmp/exomiser"
 4575
 4576                ### ANALYSIS ###
 4577                ################
 4578
 4579                # Create analysis.json through analysis dict
 4580                # either analysis in param or by default
 4581                # depending on preset exome/genome)
 4582
 4583                # Init analysis dict
 4584                param_exomiser_analysis_dict = {}
 4585
 4586                # analysis from param
 4587                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4588                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4589
 4590                # If analysis in param -> load anlaysis json
 4591                if param_exomiser_analysis:
 4592
 4593                    # If param analysis is a file and exists
 4594                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4595                        param_exomiser_analysis
 4596                    ):
 4597                        # Load analysis file into analysis dict (either yaml or json)
 4598                        with open(param_exomiser_analysis) as json_file:
 4599                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4600
 4601                    # If param analysis is a dict
 4602                    elif isinstance(param_exomiser_analysis, dict):
 4603                        # Load analysis dict into analysis dict (either yaml or json)
 4604                        param_exomiser_analysis_dict = param_exomiser_analysis
 4605
 4606                    # Error analysis type
 4607                    else:
 4608                        log.error(f"Analysis type unknown. Check param file.")
 4609                        raise ValueError(f"Analysis type unknown. Check param file.")
 4610
 4611                # Case no input analysis config file/dict
 4612                # Use preset (exome/genome) to open default config file
 4613                if not param_exomiser_analysis_dict:
 4614
 4615                    # default preset
 4616                    default_preset = "exome"
 4617
 4618                    # Get param preset or default preset
 4619                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4620
 4621                    # Try to find if preset is a file
 4622                    if os.path.exists(param_exomiser_preset):
 4623                        # Preset file is provided in full path
 4624                        param_exomiser_analysis_default_config_file = (
 4625                            param_exomiser_preset
 4626                        )
 4627                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4628                    #     # Preset file is provided in full path
 4629                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4630                    elif os.path.exists(
 4631                        os.path.join(folder_config, param_exomiser_preset)
 4632                    ):
 4633                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4634                        param_exomiser_analysis_default_config_file = os.path.join(
 4635                            folder_config, param_exomiser_preset
 4636                        )
 4637                    else:
 4638                        # Construct preset file
 4639                        param_exomiser_analysis_default_config_file = os.path.join(
 4640                            folder_config,
 4641                            f"preset-{param_exomiser_preset}-analysis.json",
 4642                        )
 4643
 4644                    # If preset file exists
 4645                    param_exomiser_analysis_default_config_file = full_path(
 4646                        param_exomiser_analysis_default_config_file
 4647                    )
 4648                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4649                        # Load prest file into analysis dict (either yaml or json)
 4650                        with open(
 4651                            param_exomiser_analysis_default_config_file
 4652                        ) as json_file:
 4653                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4654                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4655                                json_file
 4656                            )
 4657
 4658                    # Error preset file
 4659                    else:
 4660                        log.error(
 4661                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4662                        )
 4663                        raise ValueError(
 4664                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4665                        )
 4666
 4667                # If no analysis dict created
 4668                if not param_exomiser_analysis_dict:
 4669                    log.error(f"No analysis config")
 4670                    raise ValueError(f"No analysis config")
 4671
 4672                # Log
 4673                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4674
 4675                ### PHENOPACKET ###
 4676                ###################
 4677
 4678                # If no PhenoPacket in analysis dict -> check in param
 4679                if "phenopacket" not in param_exomiser_analysis_dict:
 4680
 4681                    # If PhenoPacket in param -> load anlaysis json
 4682                    if param_exomiser.get("phenopacket", None):
 4683
 4684                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4685                        param_exomiser_phenopacket = full_path(
 4686                            param_exomiser_phenopacket
 4687                        )
 4688
 4689                        # If param phenopacket is a file and exists
 4690                        if isinstance(
 4691                            param_exomiser_phenopacket, str
 4692                        ) and os.path.exists(param_exomiser_phenopacket):
 4693                            # Load phenopacket file into analysis dict (either yaml or json)
 4694                            with open(param_exomiser_phenopacket) as json_file:
 4695                                param_exomiser_analysis_dict["phenopacket"] = (
 4696                                    yaml.safe_load(json_file)
 4697                                )
 4698
 4699                        # If param phenopacket is a dict
 4700                        elif isinstance(param_exomiser_phenopacket, dict):
 4701                            # Load phenopacket dict into analysis dict (either yaml or json)
 4702                            param_exomiser_analysis_dict["phenopacket"] = (
 4703                                param_exomiser_phenopacket
 4704                            )
 4705
 4706                        # Error phenopacket type
 4707                        else:
 4708                            log.error(f"Phenopacket type unknown. Check param file.")
 4709                            raise ValueError(
 4710                                f"Phenopacket type unknown. Check param file."
 4711                            )
 4712
 4713                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4714                if "phenopacket" not in param_exomiser_analysis_dict:
 4715
 4716                    # Init PhenoPacket
 4717                    param_exomiser_analysis_dict["phenopacket"] = {
 4718                        "id": "analysis",
 4719                        "proband": {},
 4720                    }
 4721
 4722                    ### Add subject ###
 4723
 4724                    # If subject exists
 4725                    param_exomiser_subject = param_exomiser.get("subject", {})
 4726
 4727                    # If subject not exists -> found sample ID
 4728                    if not param_exomiser_subject:
 4729
 4730                        # Found sample ID in param
 4731                        sample = param_exomiser.get("sample", None)
 4732
 4733                        # Find sample ID (first sample)
 4734                        if not sample:
 4735                            sample_list = self.get_header_sample_list()
 4736                            if len(sample_list) > 0:
 4737                                sample = sample_list[0]
 4738                            else:
 4739                                log.error(f"No sample found")
 4740                                raise ValueError(f"No sample found")
 4741
 4742                        # Create subject
 4743                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4744
 4745                    # Add to dict
 4746                    param_exomiser_analysis_dict["phenopacket"][
 4747                        "subject"
 4748                    ] = param_exomiser_subject
 4749
 4750                    ### Add "phenotypicFeatures" ###
 4751
 4752                    # If phenotypicFeatures exists
 4753                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4754                        "phenotypicFeatures", []
 4755                    )
 4756
 4757                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4758                    if not param_exomiser_phenotypicfeatures:
 4759
 4760                        # Found HPO in param
 4761                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4762
 4763                        # Split HPO if list in string format separated by comma
 4764                        if isinstance(param_exomiser_hpo, str):
 4765                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4766
 4767                        # Create HPO list
 4768                        for hpo in param_exomiser_hpo:
 4769                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4770                            param_exomiser_phenotypicfeatures.append(
 4771                                {
 4772                                    "type": {
 4773                                        "id": f"HP:{hpo_clean}",
 4774                                        "label": f"HP:{hpo_clean}",
 4775                                    }
 4776                                }
 4777                            )
 4778
 4779                    # Add to dict
 4780                    param_exomiser_analysis_dict["phenopacket"][
 4781                        "phenotypicFeatures"
 4782                    ] = param_exomiser_phenotypicfeatures
 4783
 4784                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4785                    if not param_exomiser_phenotypicfeatures:
 4786                        for step in param_exomiser_analysis_dict.get(
 4787                            "analysis", {}
 4788                        ).get("steps", []):
 4789                            if "hiPhivePrioritiser" in step:
 4790                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4791                                    "steps", []
 4792                                ).remove(step)
 4793
 4794                ### Add Input File ###
 4795
 4796                # Initial file name and htsFiles
 4797                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4798                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4799                    {
 4800                        "uri": tmp_vcf_name,
 4801                        "htsFormat": "VCF",
 4802                        "genomeAssembly": assembly,
 4803                    }
 4804                ]
 4805
 4806                ### Add metaData ###
 4807
 4808                # If metaData not in analysis dict
 4809                if "metaData" not in param_exomiser_analysis_dict:
 4810                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4811                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4812                        "createdBy": "howard",
 4813                        "phenopacketSchemaVersion": 1,
 4814                    }
 4815
 4816                ### OutputOptions ###
 4817
 4818                # Init output result folder
 4819                output_results = os.path.join(tmp_dir, "results")
 4820
 4821                # If no outputOptions in analysis dict
 4822                if "outputOptions" not in param_exomiser_analysis_dict:
 4823
 4824                    # default output formats
 4825                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4826
 4827                    # Get outputOptions in param
 4828                    output_options = param_exomiser.get("outputOptions", None)
 4829
 4830                    # If no output_options in param -> check
 4831                    if not output_options:
 4832                        output_options = {
 4833                            "outputContributingVariantsOnly": False,
 4834                            "numGenes": 0,
 4835                            "outputFormats": defaut_output_formats,
 4836                        }
 4837
 4838                    # Replace outputDirectory in output options
 4839                    output_options["outputDirectory"] = output_results
 4840                    output_options["outputFileName"] = "howard"
 4841
 4842                    # Add outputOptions in analysis dict
 4843                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4844
 4845                else:
 4846
 4847                    # Replace output_results and output format (if exists in param)
 4848                    param_exomiser_analysis_dict["outputOptions"][
 4849                        "outputDirectory"
 4850                    ] = output_results
 4851                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4852                        list(
 4853                            set(
 4854                                param_exomiser_analysis_dict.get(
 4855                                    "outputOptions", {}
 4856                                ).get("outputFormats", [])
 4857                                + ["TSV_VARIANT", "VCF"]
 4858                            )
 4859                        )
 4860                    )
 4861
 4862                # log
 4863                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4864
 4865                ### ANALYSIS FILE ###
 4866                #####################
 4867
 4868                ### Full JSON analysis config file ###
 4869
 4870                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4871                with open(exomiser_analysis, "w") as fp:
 4872                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4873
 4874                ### SPLIT analysis and sample config files
 4875
 4876                # Splitted analysis dict
 4877                param_exomiser_analysis_dict_for_split = (
 4878                    param_exomiser_analysis_dict.copy()
 4879                )
 4880
 4881                # Phenopacket JSON file
 4882                exomiser_analysis_phenopacket = os.path.join(
 4883                    tmp_dir, "analysis_phenopacket.json"
 4884                )
 4885                with open(exomiser_analysis_phenopacket, "w") as fp:
 4886                    json.dump(
 4887                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4888                        fp,
 4889                        indent=4,
 4890                    )
 4891
 4892                # Analysis JSON file without Phenopacket parameters
 4893                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4894                exomiser_analysis_analysis = os.path.join(
 4895                    tmp_dir, "analysis_analysis.json"
 4896                )
 4897                with open(exomiser_analysis_analysis, "w") as fp:
 4898                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4899
 4900                ### INITAL VCF file ###
 4901                #######################
 4902
 4903                ### Create list of samples to use and include inti initial VCF file ####
 4904
 4905                # Subject (main sample)
 4906                # Get sample ID in analysis dict
 4907                sample_subject = (
 4908                    param_exomiser_analysis_dict.get("phenopacket", {})
 4909                    .get("subject", {})
 4910                    .get("id", None)
 4911                )
 4912                sample_proband = (
 4913                    param_exomiser_analysis_dict.get("phenopacket", {})
 4914                    .get("proband", {})
 4915                    .get("subject", {})
 4916                    .get("id", None)
 4917                )
 4918                sample = []
 4919                if sample_subject:
 4920                    sample.append(sample_subject)
 4921                if sample_proband:
 4922                    sample.append(sample_proband)
 4923
 4924                # Get sample ID within Pedigree
 4925                pedigree_persons_list = (
 4926                    param_exomiser_analysis_dict.get("phenopacket", {})
 4927                    .get("pedigree", {})
 4928                    .get("persons", {})
 4929                )
 4930
 4931                # Create list with all sample ID in pedigree (if exists)
 4932                pedigree_persons = []
 4933                for person in pedigree_persons_list:
 4934                    pedigree_persons.append(person.get("individualId"))
 4935
 4936                # Concat subject sample ID and samples ID in pedigreesamples
 4937                samples = list(set(sample + pedigree_persons))
 4938
 4939                # Check if sample list is not empty
 4940                if not samples:
 4941                    log.error(f"No samples found")
 4942                    raise ValueError(f"No samples found")
 4943
 4944                # Create VCF with sample (either sample in param or first one by default)
 4945                # Export VCF file
 4946                self.export_variant_vcf(
 4947                    vcf_file=tmp_vcf_name,
 4948                    remove_info=True,
 4949                    add_samples=True,
 4950                    list_samples=samples,
 4951                    index=False,
 4952                )
 4953
 4954                ### Execute Exomiser ###
 4955                ########################
 4956
 4957                # Init command
 4958                exomiser_command = ""
 4959
 4960                # Command exomiser options
 4961                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4962
 4963                # Release
 4964                exomiser_release = param_exomiser.get("release", None)
 4965                if exomiser_release:
 4966                    # phenotype data version
 4967                    exomiser_options += (
 4968                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4969                    )
 4970                    # data version
 4971                    exomiser_options += (
 4972                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4973                    )
 4974                    # variant white list
 4975                    variant_white_list_file = (
 4976                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4977                    )
 4978                    if os.path.exists(
 4979                        os.path.join(
 4980                            databases_folders, assembly, variant_white_list_file
 4981                        )
 4982                    ):
 4983                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4984
 4985                # transcript_source
 4986                transcript_source = param_exomiser.get(
 4987                    "transcript_source", None
 4988                )  # ucsc, refseq, ensembl
 4989                if transcript_source:
 4990                    exomiser_options += (
 4991                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4992                    )
 4993
 4994                # If analysis contain proband param
 4995                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4996                    "proband", {}
 4997                ):
 4998                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4999
 5000                # If no proband (usually uniq sample)
 5001                else:
 5002                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5003
 5004                # Log
 5005                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5006
 5007                # Run command
 5008                result = subprocess.call(
 5009                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5010                )
 5011                if result:
 5012                    log.error("Exomiser command failed")
 5013                    raise ValueError("Exomiser command failed")
 5014
 5015                ### RESULTS ###
 5016                ###############
 5017
 5018                ### Annotate with TSV fields ###
 5019
 5020                # Init result tsv file
 5021                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5022
 5023                # Init result tsv file
 5024                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5025
 5026                # Parse TSV file and explode columns in INFO field
 5027                if exomiser_to_info and os.path.exists(output_results_tsv):
 5028
 5029                    # Log
 5030                    log.debug("Exomiser columns to VCF INFO field")
 5031
 5032                    # Retrieve columns and types
 5033                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5034                    output_results_tsv_df = self.get_query_to_df(query)
 5035                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5036
 5037                    # Init concat fields for update
 5038                    sql_query_update_concat_fields = []
 5039
 5040                    # Fields to avoid
 5041                    fields_to_avoid = [
 5042                        "CONTIG",
 5043                        "START",
 5044                        "END",
 5045                        "REF",
 5046                        "ALT",
 5047                        "QUAL",
 5048                        "FILTER",
 5049                        "GENOTYPE",
 5050                    ]
 5051
 5052                    # List all columns to add into header
 5053                    for header_column in output_results_tsv_columns:
 5054
 5055                        # If header column is enable
 5056                        if header_column not in fields_to_avoid:
 5057
 5058                            # Header info type
 5059                            header_info_type = "String"
 5060                            header_column_df = output_results_tsv_df[header_column]
 5061                            header_column_df_dtype = header_column_df.dtype
 5062                            if header_column_df_dtype == object:
 5063                                if (
 5064                                    pd.to_numeric(header_column_df, errors="coerce")
 5065                                    .notnull()
 5066                                    .all()
 5067                                ):
 5068                                    header_info_type = "Float"
 5069                            else:
 5070                                header_info_type = "Integer"
 5071
 5072                            # Header info
 5073                            characters_to_validate = ["-"]
 5074                            pattern = "[" + "".join(characters_to_validate) + "]"
 5075                            header_info_name = re.sub(
 5076                                pattern,
 5077                                "_",
 5078                                f"Exomiser_{header_column}".replace("#", ""),
 5079                            )
 5080                            header_info_number = "."
 5081                            header_info_description = (
 5082                                f"Exomiser {header_column} annotation"
 5083                            )
 5084                            header_info_source = "Exomiser"
 5085                            header_info_version = "unknown"
 5086                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5087                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5088                                header_info_name,
 5089                                header_info_number,
 5090                                header_info_type,
 5091                                header_info_description,
 5092                                header_info_source,
 5093                                header_info_version,
 5094                                header_info_code,
 5095                            )
 5096
 5097                            # Add field to add for update to concat fields
 5098                            sql_query_update_concat_fields.append(
 5099                                f"""
 5100                                CASE
 5101                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5102                                    THEN concat(
 5103                                        '{header_info_name}=',
 5104                                        table_parquet."{header_column}",
 5105                                        ';'
 5106                                        )
 5107
 5108                                    ELSE ''
 5109                                END
 5110                            """
 5111                            )
 5112
 5113                    # Update query
 5114                    sql_query_update = f"""
 5115                        UPDATE {table_variants} as table_variants
 5116                            SET INFO = concat(
 5117                                            CASE
 5118                                                WHEN INFO NOT IN ('', '.')
 5119                                                THEN INFO
 5120                                                ELSE ''
 5121                                            END,
 5122                                            CASE
 5123                                                WHEN table_variants.INFO NOT IN ('','.')
 5124                                                THEN ';'
 5125                                                ELSE ''
 5126                                            END,
 5127                                            (
 5128                                            SELECT 
 5129                                                concat(
 5130                                                    {",".join(sql_query_update_concat_fields)}
 5131                                                )
 5132                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5133                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5134                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5135                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5136                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5137                                            )
 5138                                        )
 5139                            ;
 5140                        """
 5141
 5142                    # Update
 5143                    self.conn.execute(sql_query_update)
 5144
 5145                ### Annotate with VCF INFO field ###
 5146
 5147                # Init result VCF file
 5148                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5149
 5150                # If VCF exists
 5151                if os.path.exists(output_results_vcf):
 5152
 5153                    # Log
 5154                    log.debug("Exomiser result VCF update variants")
 5155
 5156                    # Find Exomiser INFO field annotation in header
 5157                    with gzip.open(output_results_vcf, "rt") as f:
 5158                        header_list = self.read_vcf_header(f)
 5159                    exomiser_vcf_header = vcf.Reader(
 5160                        io.StringIO("\n".join(header_list))
 5161                    )
 5162
 5163                    # Add annotation INFO field to header
 5164                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5165
 5166                    # Update variants with VCF
 5167                    self.update_from_vcf(output_results_vcf)
 5168
 5169        return True
 5170
 5171    def annotation_snpeff(self, threads: int = None) -> None:
 5172        """
 5173        This function annotate with snpEff
 5174
 5175        :param threads: The number of threads to use
 5176        :return: the value of the variable "return_value".
 5177        """
 5178
 5179        # DEBUG
 5180        log.debug("Start annotation with snpeff databases")
 5181
 5182        # Threads
 5183        if not threads:
 5184            threads = self.get_threads()
 5185        log.debug("Threads: " + str(threads))
 5186
 5187        # DEBUG
 5188        delete_tmp = True
 5189        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5190            delete_tmp = False
 5191            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5192
 5193        # Config
 5194        config = self.get_config()
 5195        log.debug("Config: " + str(config))
 5196
 5197        # Config - Folders - Databases
 5198        databases_folders = (
 5199            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5200        )
 5201        log.debug("Databases annotations: " + str(databases_folders))
 5202
 5203        # Config - snpEff bin command
 5204        snpeff_bin_command = get_bin_command(
 5205            bin="snpEff.jar",
 5206            tool="snpeff",
 5207            bin_type="jar",
 5208            config=config,
 5209            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5210        )
 5211        if not snpeff_bin_command:
 5212            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5213            log.error(msg_err)
 5214            raise ValueError(msg_err)
 5215
 5216        # Config - snpEff databases
 5217        snpeff_databases = (
 5218            config.get("folders", {})
 5219            .get("databases", {})
 5220            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5221        )
 5222        snpeff_databases = full_path(snpeff_databases)
 5223        if snpeff_databases is not None and snpeff_databases != "":
 5224            log.debug(f"Create snpEff databases folder")
 5225            if not os.path.exists(snpeff_databases):
 5226                os.makedirs(snpeff_databases)
 5227
 5228        # Param
 5229        param = self.get_param()
 5230        log.debug("Param: " + str(param))
 5231
 5232        # Param
 5233        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5234        log.debug("Options: " + str(options))
 5235
 5236        # Param - Assembly
 5237        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5238
 5239        # Param - Options
 5240        snpeff_options = (
 5241            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5242        )
 5243        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5244        snpeff_csvstats = (
 5245            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5246        )
 5247        if snpeff_stats:
 5248            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5249            snpeff_stats = full_path(snpeff_stats)
 5250            snpeff_options += f" -stats {snpeff_stats}"
 5251        if snpeff_csvstats:
 5252            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5253            snpeff_csvstats = full_path(snpeff_csvstats)
 5254            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5255
 5256        # Data
 5257        table_variants = self.get_table_variants()
 5258
 5259        # Check if not empty
 5260        log.debug("Check if not empty")
 5261        sql_query_chromosomes = (
 5262            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5263        )
 5264        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5265        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5266            log.info(f"VCF empty")
 5267            return
 5268
 5269        # Export in VCF
 5270        log.debug("Create initial file to annotate")
 5271        tmp_vcf = NamedTemporaryFile(
 5272            prefix=self.get_prefix(),
 5273            dir=self.get_tmp_dir(),
 5274            suffix=".vcf.gz",
 5275            delete=True,
 5276        )
 5277        tmp_vcf_name = tmp_vcf.name
 5278
 5279        # VCF header
 5280        vcf_reader = self.get_header()
 5281        log.debug("Initial header: " + str(vcf_reader.infos))
 5282
 5283        # Existing annotations
 5284        for vcf_annotation in self.get_header().infos:
 5285
 5286            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5287            log.debug(
 5288                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5289            )
 5290
 5291        # Memory limit
 5292        # if config.get("memory", None):
 5293        #     memory_limit = config.get("memory", "8G")
 5294        # else:
 5295        #     memory_limit = "8G"
 5296        memory_limit = self.get_memory("8G")
 5297        log.debug(f"memory_limit: {memory_limit}")
 5298
 5299        # snpEff java options
 5300        snpeff_java_options = (
 5301            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5302        )
 5303        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5304
 5305        force_update_annotation = True
 5306
 5307        if "ANN" not in self.get_header().infos or force_update_annotation:
 5308
 5309            # Check snpEff database
 5310            log.debug(f"Check snpEff databases {[assembly]}")
 5311            databases_download_snpeff(
 5312                folder=snpeff_databases, assemblies=[assembly], config=config
 5313            )
 5314
 5315            # Export VCF file
 5316            self.export_variant_vcf(
 5317                vcf_file=tmp_vcf_name,
 5318                remove_info=True,
 5319                add_samples=False,
 5320                index=True,
 5321            )
 5322
 5323            # Tmp file
 5324            err_files = []
 5325            tmp_annotate_vcf = NamedTemporaryFile(
 5326                prefix=self.get_prefix(),
 5327                dir=self.get_tmp_dir(),
 5328                suffix=".vcf",
 5329                delete=False,
 5330            )
 5331            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5332            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5333            err_files.append(tmp_annotate_vcf_name_err)
 5334
 5335            # Command
 5336            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5337            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5338            run_parallel_commands([snpeff_command], 1)
 5339
 5340            # Error messages
 5341            log.info(f"Error/Warning messages:")
 5342            error_message_command_all = []
 5343            error_message_command_warning = []
 5344            error_message_command_err = []
 5345            for err_file in err_files:
 5346                with open(err_file, "r") as f:
 5347                    for line in f:
 5348                        message = line.strip()
 5349                        error_message_command_all.append(message)
 5350                        if line.startswith("[W::"):
 5351                            error_message_command_warning.append(message)
 5352                        if line.startswith("[E::"):
 5353                            error_message_command_err.append(f"{err_file}: " + message)
 5354            # log info
 5355            for message in list(
 5356                set(error_message_command_err + error_message_command_warning)
 5357            ):
 5358                log.info(f"   {message}")
 5359            # debug info
 5360            for message in list(set(error_message_command_all)):
 5361                log.debug(f"   {message}")
 5362            # failed
 5363            if len(error_message_command_err):
 5364                log.error("Annotation failed: Error in commands")
 5365                raise ValueError("Annotation failed: Error in commands")
 5366
 5367            # Find annotation in header
 5368            with open(tmp_annotate_vcf_name, "rt") as f:
 5369                header_list = self.read_vcf_header(f)
 5370            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5371
 5372            for ann in annovar_vcf_header.infos:
 5373                if ann not in self.get_header().infos:
 5374                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5375
 5376            # Update variants
 5377            log.info(f"Annotation - Updating...")
 5378            self.update_from_vcf(tmp_annotate_vcf_name)
 5379
 5380        else:
 5381            if "ANN" in self.get_header().infos:
 5382                log.debug(f"Existing snpEff annotations in VCF")
 5383            if force_update_annotation:
 5384                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5385
 5386    def annotation_annovar(self, threads: int = None) -> None:
 5387        """
 5388        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5389        annotations
 5390
 5391        :param threads: number of threads to use
 5392        :return: the value of the variable "return_value".
 5393        """
 5394
 5395        # DEBUG
 5396        log.debug("Start annotation with Annovar databases")
 5397
 5398        # Threads
 5399        if not threads:
 5400            threads = self.get_threads()
 5401        log.debug("Threads: " + str(threads))
 5402
 5403        # Tmp en Err files
 5404        tmp_files = []
 5405        err_files = []
 5406
 5407        # DEBUG
 5408        delete_tmp = True
 5409        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5410            delete_tmp = False
 5411            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5412
 5413        # Config
 5414        config = self.get_config()
 5415        log.debug("Config: " + str(config))
 5416
 5417        # Config - Folders - Databases
 5418        databases_folders = (
 5419            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5420        )
 5421        log.debug("Databases annotations: " + str(databases_folders))
 5422
 5423        # Config - annovar bin command
 5424        annovar_bin_command = get_bin_command(
 5425            bin="table_annovar.pl",
 5426            tool="annovar",
 5427            bin_type="perl",
 5428            config=config,
 5429            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5430        )
 5431        if not annovar_bin_command:
 5432            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5433            log.error(msg_err)
 5434            raise ValueError(msg_err)
 5435
 5436        # Config - BCFTools bin command
 5437        bcftools_bin_command = get_bin_command(
 5438            bin="bcftools",
 5439            tool="bcftools",
 5440            bin_type="bin",
 5441            config=config,
 5442            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5443        )
 5444        if not bcftools_bin_command:
 5445            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5446            log.error(msg_err)
 5447            raise ValueError(msg_err)
 5448
 5449        # Config - annovar databases
 5450        annovar_databases = (
 5451            config.get("folders", {})
 5452            .get("databases", {})
 5453            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5454        )
 5455        if annovar_databases is not None:
 5456            if isinstance(annovar_databases, list):
 5457                annovar_databases = full_path(annovar_databases[0])
 5458                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5459            annovar_databases = full_path(annovar_databases)
 5460            if not os.path.exists(annovar_databases):
 5461                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5462                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5463        else:
 5464            msg_err = f"Annovar databases configuration failed"
 5465            log.error(msg_err)
 5466            raise ValueError(msg_err)
 5467
 5468        # Param
 5469        param = self.get_param()
 5470        log.debug("Param: " + str(param))
 5471
 5472        # Param - options
 5473        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5474        log.debug("Options: " + str(options))
 5475
 5476        # Param - annotations
 5477        annotations = (
 5478            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5479        )
 5480        log.debug("Annotations: " + str(annotations))
 5481
 5482        # Param - Assembly
 5483        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5484
 5485        # Annovar database assembly
 5486        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5487        if annovar_databases_assembly != "" and not os.path.exists(
 5488            annovar_databases_assembly
 5489        ):
 5490            os.makedirs(annovar_databases_assembly)
 5491
 5492        # Data
 5493        table_variants = self.get_table_variants()
 5494
 5495        # Check if not empty
 5496        log.debug("Check if not empty")
 5497        sql_query_chromosomes = (
 5498            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5499        )
 5500        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5501        if not sql_query_chromosomes_df["count"][0]:
 5502            log.info(f"VCF empty")
 5503            return
 5504
 5505        # VCF header
 5506        vcf_reader = self.get_header()
 5507        log.debug("Initial header: " + str(vcf_reader.infos))
 5508
 5509        # Existing annotations
 5510        for vcf_annotation in self.get_header().infos:
 5511
 5512            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5513            log.debug(
 5514                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5515            )
 5516
 5517        force_update_annotation = True
 5518
 5519        if annotations:
 5520
 5521            commands = []
 5522            tmp_annotates_vcf_name_list = []
 5523
 5524            # Export in VCF
 5525            log.debug("Create initial file to annotate")
 5526            tmp_vcf = NamedTemporaryFile(
 5527                prefix=self.get_prefix(),
 5528                dir=self.get_tmp_dir(),
 5529                suffix=".vcf.gz",
 5530                delete=False,
 5531            )
 5532            tmp_vcf_name = tmp_vcf.name
 5533            tmp_files.append(tmp_vcf_name)
 5534            tmp_files.append(tmp_vcf_name + ".tbi")
 5535
 5536            # Export VCF file
 5537            self.export_variant_vcf(
 5538                vcf_file=tmp_vcf_name,
 5539                remove_info=".",
 5540                add_samples=False,
 5541                index=True,
 5542            )
 5543
 5544            # Create file for field rename
 5545            log.debug("Create file for field rename")
 5546            tmp_rename = NamedTemporaryFile(
 5547                prefix=self.get_prefix(),
 5548                dir=self.get_tmp_dir(),
 5549                suffix=".rename",
 5550                delete=False,
 5551            )
 5552            tmp_rename_name = tmp_rename.name
 5553            tmp_files.append(tmp_rename_name)
 5554
 5555            # Check Annovar database
 5556            log.debug(
 5557                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5558            )
 5559            databases_download_annovar(
 5560                folder=annovar_databases,
 5561                files=list(annotations.keys()),
 5562                assemblies=[assembly],
 5563            )
 5564
 5565            for annotation in annotations:
 5566                annotation_fields = annotations[annotation]
 5567
 5568                if not annotation_fields:
 5569                    annotation_fields = {"INFO": None}
 5570
 5571                log.info(f"Annotations Annovar - database '{annotation}'")
 5572                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5573
 5574                # Tmp file for annovar
 5575                err_files = []
 5576                tmp_annotate_vcf_directory = TemporaryDirectory(
 5577                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5578                )
 5579                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5580                tmp_annotate_vcf_name_annovar = (
 5581                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5582                )
 5583                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5584                err_files.append(tmp_annotate_vcf_name_err)
 5585                tmp_files.append(tmp_annotate_vcf_name_err)
 5586
                # Tmp file: final VCF annotated by Annovar
 5588                tmp_annotate_vcf = NamedTemporaryFile(
 5589                    prefix=self.get_prefix(),
 5590                    dir=self.get_tmp_dir(),
 5591                    suffix=".vcf.gz",
 5592                    delete=False,
 5593                )
 5594                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5595                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5596                tmp_files.append(tmp_annotate_vcf_name)
 5597                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5598
 5599                # Number of fields
 5600                annotation_list = []
 5601                annotation_renamed_list = []
 5602
 5603                for annotation_field in annotation_fields:
 5604
                    # Field new name, if configured. NOTE: renaming is currently SKIPPED — not managed yet (TODO)
 5606                    annotation_fields_new_name = annotation_fields.get(
 5607                        annotation_field, annotation_field
 5608                    )
 5609                    if not annotation_fields_new_name:
 5610                        annotation_fields_new_name = annotation_field
 5611
 5612                    if (
 5613                        force_update_annotation
 5614                        or annotation_fields_new_name not in self.get_header().infos
 5615                    ):
 5616                        annotation_list.append(annotation_field)
 5617                        annotation_renamed_list.append(annotation_fields_new_name)
 5618                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5619                        log.warning(
 5620                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5621                        )
 5622
 5623                    # Add rename info
 5624                    run_parallel_commands(
 5625                        [
 5626                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5627                        ],
 5628                        1,
 5629                    )
 5630
 5631                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5632                log.debug("annotation_list: " + str(annotation_list))
 5633
 5634                # protocol
 5635                protocol = annotation
 5636
 5637                # argument
 5638                argument = ""
 5639
 5640                # operation
 5641                operation = "f"
 5642                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5643                    "ensGene"
 5644                ):
 5645                    operation = "g"
 5646                    if options.get("genebase", None):
 5647                        argument = f"""'{options.get("genebase","")}'"""
 5648                elif annotation in ["cytoBand"]:
 5649                    operation = "r"
 5650
 5651                # argument option
 5652                argument_option = ""
 5653                if argument != "":
 5654                    argument_option = " --argument " + argument
 5655
 5656                # command options
 5657                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5658                for option in options:
 5659                    if option not in ["genebase"]:
 5660                        command_options += f""" --{option}={options[option]}"""
 5661
 5662                # Command
 5663
 5664                # Command - Annovar
 5665                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5666                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5667
 5668                # Command - start pipe
 5669                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5670
 5671                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5672                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5673
 5674                # Command - Special characters (refGene annotation)
 5675                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5676
 5677                # Command - Clean empty fields (with value ".")
 5678                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5679
 5680                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5681                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5682                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5683                    # for ann in annotation_renamed_list:
 5684                    for ann in annotation_list:
 5685                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5686
 5687                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5688
 5689                # Command - indexing
 5690                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5691
 5692                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5693                run_parallel_commands([command_annovar], 1)
 5694
 5695                # Error messages
 5696                log.info(f"Error/Warning messages:")
 5697                error_message_command_all = []
 5698                error_message_command_warning = []
 5699                error_message_command_err = []
 5700                for err_file in err_files:
 5701                    with open(err_file, "r") as f:
 5702                        for line in f:
 5703                            message = line.strip()
 5704                            error_message_command_all.append(message)
 5705                            if line.startswith("[W::") or line.startswith("WARNING"):
 5706                                error_message_command_warning.append(message)
 5707                            if line.startswith("[E::") or line.startswith("ERROR"):
 5708                                error_message_command_err.append(
 5709                                    f"{err_file}: " + message
 5710                                )
 5711                # log info
 5712                for message in list(
 5713                    set(error_message_command_err + error_message_command_warning)
 5714                ):
 5715                    log.info(f"   {message}")
 5716                # debug info
 5717                for message in list(set(error_message_command_all)):
 5718                    log.debug(f"   {message}")
 5719                # failed
 5720                if len(error_message_command_err):
 5721                    log.error("Annotation failed: Error in commands")
 5722                    raise ValueError("Annotation failed: Error in commands")
 5723
 5724            if tmp_annotates_vcf_name_list:
 5725
 5726                # List of annotated files
 5727                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5728
 5729                # Tmp file
 5730                tmp_annotate_vcf = NamedTemporaryFile(
 5731                    prefix=self.get_prefix(),
 5732                    dir=self.get_tmp_dir(),
 5733                    suffix=".vcf.gz",
 5734                    delete=False,
 5735                )
 5736                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5737                tmp_files.append(tmp_annotate_vcf_name)
 5738                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5739                err_files.append(tmp_annotate_vcf_name_err)
 5740                tmp_files.append(tmp_annotate_vcf_name_err)
 5741
 5742                # Command merge
 5743                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5744                log.info(
 5745                    f"Annotation Annovar - Annotation merging "
 5746                    + str(len(tmp_annotates_vcf_name_list))
 5747                    + " annotated files"
 5748                )
 5749                log.debug(f"Annotation - merge command: {merge_command}")
 5750                run_parallel_commands([merge_command], 1)
 5751
 5752                # Find annotation in header
 5753                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5754                    header_list = self.read_vcf_header(f)
 5755                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5756
 5757                for ann in annovar_vcf_header.infos:
 5758                    if ann not in self.get_header().infos:
 5759                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5760
 5761                # Update variants
 5762                log.info(f"Annotation Annovar - Updating...")
 5763                self.update_from_vcf(tmp_annotate_vcf_name)
 5764
 5765            # Clean files
 5766            # Tmp file remove command
 5767            if True:
 5768                tmp_files_remove_command = ""
 5769                if tmp_files:
 5770                    tmp_files_remove_command = " ".join(tmp_files)
 5771                clean_command = f" rm -f {tmp_files_remove_command} "
 5772                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5773                log.debug(f"Annotation - cleaning command: {clean_command}")
 5774                run_parallel_commands([clean_command], 1)
 5775
 5776    # Parquet
 5777    def annotation_parquet(self, threads: int = None) -> None:
 5778        """
 5779        It takes a VCF file, and annotates it with a parquet file
 5780
 5781        :param threads: number of threads to use for the annotation
        :return: None; the variants table is annotated in place
 5783        """
 5784
 5785        # DEBUG
 5786        log.debug("Start annotation with parquet databases")
 5787
 5788        # Threads
 5789        if not threads:
 5790            threads = self.get_threads()
 5791        log.debug("Threads: " + str(threads))
 5792
 5793        # DEBUG
 5794        delete_tmp = True
 5795        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5796            delete_tmp = False
 5797            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5798
 5799        # Config
 5800        databases_folders = set(
 5801            self.get_config()
 5802            .get("folders", {})
 5803            .get("databases", {})
 5804            .get("annotations", ["."])
 5805            + self.get_config()
 5806            .get("folders", {})
 5807            .get("databases", {})
 5808            .get("parquet", ["."])
 5809        )
 5810        log.debug("Databases annotations: " + str(databases_folders))
 5811
 5812        # Param
 5813        annotations = (
 5814            self.get_param()
 5815            .get("annotation", {})
 5816            .get("parquet", {})
 5817            .get("annotations", None)
 5818        )
 5819        log.debug("Annotations: " + str(annotations))
 5820
 5821        # Assembly
 5822        assembly = self.get_param().get(
 5823            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5824        )
 5825
 5826        # Force Update Annotation
 5827        force_update_annotation = (
 5828            self.get_param()
 5829            .get("annotation", {})
 5830            .get("options", {})
 5831            .get("annotations_update", False)
 5832        )
 5833        log.debug(f"force_update_annotation={force_update_annotation}")
 5834        force_append_annotation = (
 5835            self.get_param()
 5836            .get("annotation", {})
 5837            .get("options", {})
 5838            .get("annotations_append", False)
 5839        )
 5840        log.debug(f"force_append_annotation={force_append_annotation}")
 5841
 5842        # Data
 5843        table_variants = self.get_table_variants()
 5844
 5845        # Check if not empty
 5846        log.debug("Check if not empty")
 5847        sql_query_chromosomes_df = self.get_query_to_df(
 5848            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5849        )
 5850        if not sql_query_chromosomes_df["count"][0]:
 5851            log.info(f"VCF empty")
 5852            return
 5853
 5854        # VCF header
 5855        vcf_reader = self.get_header()
 5856        log.debug("Initial header: " + str(vcf_reader.infos))
 5857
 5858        # Nb Variants POS
 5859        log.debug("NB Variants Start")
 5860        nb_variants = self.conn.execute(
 5861            f"SELECT count(*) AS count FROM variants"
 5862        ).fetchdf()["count"][0]
 5863        log.debug("NB Variants Stop")
 5864
 5865        # Existing annotations
 5866        for vcf_annotation in self.get_header().infos:
 5867
 5868            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5869            log.debug(
 5870                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5871            )
 5872
 5873        # Added columns
 5874        added_columns = []
 5875
 5876        # drop indexes
 5877        log.debug(f"Drop indexes...")
 5878        self.drop_indexes()
 5879
 5880        if annotations:
 5881
 5882            if "ALL" in annotations:
 5883
 5884                all_param = annotations.get("ALL", {})
 5885                all_param_formats = all_param.get("formats", None)
 5886                all_param_releases = all_param.get("releases", None)
 5887
 5888                databases_infos_dict = self.scan_databases(
 5889                    database_formats=all_param_formats,
 5890                    database_releases=all_param_releases,
 5891                )
 5892                for database_infos in databases_infos_dict.keys():
 5893                    if database_infos not in annotations:
 5894                        annotations[database_infos] = {"INFO": None}
 5895
 5896            for annotation in annotations:
 5897
 5898                if annotation in ["ALL"]:
 5899                    continue
 5900
 5901                # Annotation Name
 5902                annotation_name = os.path.basename(annotation)
 5903
 5904                # Annotation fields
 5905                annotation_fields = annotations[annotation]
 5906                if not annotation_fields:
 5907                    annotation_fields = {"INFO": None}
 5908
 5909                log.debug(f"Annotation '{annotation_name}'")
 5910                log.debug(
 5911                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5912                )
 5913
 5914                # Create Database
 5915                database = Database(
 5916                    database=annotation,
 5917                    databases_folders=databases_folders,
 5918                    assembly=assembly,
 5919                )
 5920
 5921                # Find files
 5922                parquet_file = database.get_database()
 5923                parquet_hdr_file = database.get_header_file()
 5924                parquet_type = database.get_type()
 5925
 5926                # Check if files exists
 5927                if not parquet_file or not parquet_hdr_file:
 5928                    msg_err_list = []
 5929                    if not parquet_file:
 5930                        msg_err_list.append(
 5931                            f"Annotation failed: Annotation file not found"
 5932                        )
 5933                    if parquet_file and not parquet_hdr_file:
 5934                        msg_err_list.append(
 5935                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 5936                        )
 5937
 5938                    log.error(". ".join(msg_err_list))
 5939                    raise ValueError(". ".join(msg_err_list))
 5940                else:
                    # Get parquet connection
 5942                    parquet_sql_attach = database.get_sql_database_attach(
 5943                        output="query"
 5944                    )
 5945                    if parquet_sql_attach:
 5946                        self.conn.execute(parquet_sql_attach)
 5947                    parquet_file_link = database.get_sql_database_link()
 5948                    # Log
 5949                    log.debug(
 5950                        f"Annotation '{annotation_name}' - file: "
 5951                        + str(parquet_file)
 5952                        + " and "
 5953                        + str(parquet_hdr_file)
 5954                    )
 5955
 5956                    # Database full header columns
 5957                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5958                        parquet_hdr_file
 5959                    )
 5960                    # Log
 5961                    log.debug(
 5962                        "Annotation database header columns : "
 5963                        + str(parquet_hdr_vcf_header_columns)
 5964                    )
 5965
 5966                    # Load header as VCF object
 5967                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5968                    # Log
 5969                    log.debug(
 5970                        "Annotation database header: "
 5971                        + str(parquet_hdr_vcf_header_infos)
 5972                    )
 5973
 5974                    # Get extra infos
 5975                    parquet_columns = database.get_extra_columns()
 5976                    # Log
 5977                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5978
 5979                    # Add extra columns if "ALL" in annotation_fields
 5980                    # if "ALL" in annotation_fields:
 5981                    #     allow_add_extra_column = True
 5982                    if "ALL" in annotation_fields and database.get_extra_columns():
 5983                        for extra_column in database.get_extra_columns():
 5984                            if (
 5985                                extra_column not in annotation_fields
 5986                                and extra_column.replace("INFO/", "")
 5987                                not in parquet_hdr_vcf_header_infos
 5988                            ):
 5989                                parquet_hdr_vcf_header_infos[extra_column] = (
 5990                                    vcf.parser._Info(
 5991                                        extra_column,
 5992                                        ".",
 5993                                        "String",
 5994                                        f"{extra_column} description",
 5995                                        "unknown",
 5996                                        "unknown",
 5997                                        self.code_type_map["String"],
 5998                                    )
 5999                                )
 6000
 6001                    # For all fields in database
 6002                    annotation_fields_all = False
 6003                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6004                        annotation_fields_all = True
 6005                        annotation_fields = {
 6006                            key: key for key in parquet_hdr_vcf_header_infos
 6007                        }
 6008
 6009                        log.debug(
 6010                            "Annotation database header - All annotations added: "
 6011                            + str(annotation_fields)
 6012                        )
 6013
 6014                    # Init
 6015
 6016                    # List of annotation fields to use
 6017                    sql_query_annotation_update_info_sets = []
 6018
                    # List of annotations to aggregate
 6020                    sql_query_annotation_to_agregate = []
 6021
 6022                    # Number of fields
 6023                    nb_annotation_field = 0
 6024
 6025                    # Annotation fields processed
 6026                    annotation_fields_processed = []
 6027
 6028                    # Columns mapping
 6029                    map_columns = database.map_columns(
 6030                        columns=annotation_fields, prefixes=["INFO/"]
 6031                    )
 6032
 6033                    # Query dict for fields to remove (update option)
 6034                    query_dict_remove = {}
 6035
                    # Fetch annotation fields
 6037                    for annotation_field in annotation_fields:
 6038
 6039                        # annotation_field_column
 6040                        annotation_field_column = map_columns.get(
 6041                            annotation_field, "INFO"
 6042                        )
 6043
                        # Field new name, if configured
 6045                        annotation_fields_new_name = annotation_fields.get(
 6046                            annotation_field, annotation_field
 6047                        )
 6048                        if not annotation_fields_new_name:
 6049                            annotation_fields_new_name = annotation_field
 6050
 6051                        # To annotate
 6052                        # force_update_annotation = True
 6053                        # force_append_annotation = True
 6054                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6055                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6056                            force_update_annotation
 6057                            or force_append_annotation
 6058                            or (
 6059                                annotation_fields_new_name
 6060                                not in self.get_header().infos
 6061                            )
 6062                        ):
 6063
 6064                            # Add field to annotation to process list
 6065                            annotation_fields_processed.append(
 6066                                annotation_fields_new_name
 6067                            )
 6068
                            # When forcing an update, drop the field's existing value from INFO
 6070                            annotation_fields_new_name_info_msg = ""
 6071                            if (
 6072                                force_update_annotation
 6073                                and annotation_fields_new_name
 6074                                in self.get_header().infos
 6075                            ):
 6076                                # Remove field from INFO
 6077                                query = f"""
 6078                                    UPDATE {table_variants} as table_variants
 6079                                    SET INFO = REGEXP_REPLACE(
 6080                                                concat(table_variants.INFO,''),
 6081                                                ';*{annotation_fields_new_name}=[^;]*',
 6082                                                ''
 6083                                                )
 6084                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6085                                """
 6086                                annotation_fields_new_name_info_msg = " [update]"
 6087                                query_dict_remove[
 6088                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6089                                ] = query
 6090
                            # Separator between fields in INFO
 6092                            nb_annotation_field += 1
 6093                            if nb_annotation_field > 1:
 6094                                annotation_field_sep = ";"
 6095                            else:
 6096                                annotation_field_sep = ""
 6097
 6098                            log.info(
 6099                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6100                            )
 6101
 6102                            # Add INFO field to header
 6103                            parquet_hdr_vcf_header_infos_number = (
 6104                                parquet_hdr_vcf_header_infos[annotation_field].num
 6105                                or "."
 6106                            )
 6107                            parquet_hdr_vcf_header_infos_type = (
 6108                                parquet_hdr_vcf_header_infos[annotation_field].type
 6109                                or "String"
 6110                            )
 6111                            parquet_hdr_vcf_header_infos_description = (
 6112                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6113                                or f"{annotation_field} description"
 6114                            )
 6115                            parquet_hdr_vcf_header_infos_source = (
 6116                                parquet_hdr_vcf_header_infos[annotation_field].source
 6117                                or "unknown"
 6118                            )
 6119                            parquet_hdr_vcf_header_infos_version = (
 6120                                parquet_hdr_vcf_header_infos[annotation_field].version
 6121                                or "unknown"
 6122                            )
 6123
 6124                            vcf_reader.infos[annotation_fields_new_name] = (
 6125                                vcf.parser._Info(
 6126                                    annotation_fields_new_name,
 6127                                    parquet_hdr_vcf_header_infos_number,
 6128                                    parquet_hdr_vcf_header_infos_type,
 6129                                    parquet_hdr_vcf_header_infos_description,
 6130                                    parquet_hdr_vcf_header_infos_source,
 6131                                    parquet_hdr_vcf_header_infos_version,
 6132                                    self.code_type_map[
 6133                                        parquet_hdr_vcf_header_infos_type
 6134                                    ],
 6135                                )
 6136                            )
 6137
 6138                            # Append
 6139                            if force_append_annotation:
 6140                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6141                            else:
 6142                                query_case_when_append = ""
 6143
 6144                            # Annotation/Update query fields
 6145                            # Found in INFO column
 6146                            if (
 6147                                annotation_field_column == "INFO"
 6148                                and "INFO" in parquet_hdr_vcf_header_columns
 6149                            ):
 6150                                sql_query_annotation_update_info_sets.append(
 6151                                    f"""
 6152                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6153                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6154                                        ELSE ''
 6155                                    END
 6156                                """
 6157                                )
 6158                            # Found in a specific column
 6159                            else:
 6160                                sql_query_annotation_update_info_sets.append(
 6161                                    f"""
 6162                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6163                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6164                                        ELSE ''
 6165                                    END
 6166                                """
 6167                                )
 6168                                sql_query_annotation_to_agregate.append(
 6169                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6170                                )
 6171
 6172                        # Not to annotate
 6173                        else:
 6174
 6175                            if force_update_annotation:
 6176                                annotation_message = "forced"
 6177                            else:
 6178                                annotation_message = "skipped"
 6179
 6180                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6181                                log.warning(
 6182                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6183                                )
 6184                            if annotation_fields_new_name in self.get_header().infos:
 6185                                log.warning(
 6186                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6187                                )
 6188
 6189                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6190                    # allow_annotation_full_info = True
 6191                    allow_annotation_full_info = not force_append_annotation
 6192
 6193                    if parquet_type in ["regions"]:
 6194                        allow_annotation_full_info = False
 6195
 6196                    if (
 6197                        allow_annotation_full_info
 6198                        and nb_annotation_field == len(annotation_fields)
 6199                        and annotation_fields_all
 6200                        and (
 6201                            "INFO" in parquet_hdr_vcf_header_columns
 6202                            and "INFO" in database.get_extra_columns()
 6203                        )
 6204                    ):
 6205                        log.debug("Column INFO annotation enabled")
 6206                        sql_query_annotation_update_info_sets = []
 6207                        sql_query_annotation_update_info_sets.append(
 6208                            f" table_parquet.INFO "
 6209                        )
 6210
 6211                    if sql_query_annotation_update_info_sets:
 6212
 6213                        # Annotate
 6214                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6215
 6216                        # Join query annotation update info sets for SQL
 6217                        sql_query_annotation_update_info_sets_sql = ",".join(
 6218                            sql_query_annotation_update_info_sets
 6219                        )
 6220
 6221                        # Check chromosomes list (and variants infos)
 6222                        sql_query_chromosomes = f"""
 6223                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6224                            FROM {table_variants} as table_variants
 6225                            GROUP BY table_variants."#CHROM"
 6226                            ORDER BY table_variants."#CHROM"
 6227                            """
 6228                        sql_query_chromosomes_df = self.conn.execute(
 6229                            sql_query_chromosomes
 6230                        ).df()
 6231                        sql_query_chromosomes_dict = {
 6232                            entry["CHROM"]: {
 6233                                "count": entry["count_variants"],
 6234                                "min": entry["min_variants"],
 6235                                "max": entry["max_variants"],
 6236                            }
 6237                            for index, entry in sql_query_chromosomes_df.iterrows()
 6238                        }
 6239
 6240                        # Init
 6241                        nb_of_query = 0
 6242                        nb_of_variant_annotated = 0
 6243                        query_dict = query_dict_remove
 6244
 6245                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6246                        for chrom in sql_query_chromosomes_dict:
 6247
 6248                            # Number of variant by chromosome
 6249                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6250                                chrom, {}
 6251                            ).get("count", 0)
 6252
 6253                            log.debug(
 6254                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6255                            )
 6256
 6257                            # Annotation with regions database
 6258                            if parquet_type in ["regions"]:
 6259                                sql_query_annotation_from_clause = f"""
 6260                                    FROM (
 6261                                        SELECT 
 6262                                            '{chrom}' AS \"#CHROM\",
 6263                                            table_variants_from.\"POS\" AS \"POS\",
 6264                                            {",".join(sql_query_annotation_to_agregate)}
 6265                                        FROM {table_variants} as table_variants_from
 6266                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6267                                            table_parquet_from."#CHROM" = '{chrom}'
 6268                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6269                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 6270                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6271                                                )
 6272                                        )
 6273                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6274                                        GROUP BY table_variants_from.\"POS\"
 6275                                        )
 6276                                        as table_parquet
 6277                                """
 6278
 6279                                sql_query_annotation_where_clause = """
 6280                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6281                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6282                                """
 6283
 6284                            # Annotation with variants database
 6285                            else:
 6286                                sql_query_annotation_from_clause = f"""
 6287                                    FROM {parquet_file_link} as table_parquet
 6288                                """
 6289                                sql_query_annotation_where_clause = f"""
 6290                                    table_variants."#CHROM" = '{chrom}'
 6291                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6292                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6293                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6294                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6295                                """
 6296
 6297                            # Create update query
 6298                            sql_query_annotation_chrom_interval_pos = f"""
 6299                                UPDATE {table_variants} as table_variants
 6300                                    SET INFO = 
 6301                                        concat(
 6302                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6303                                                THEN table_variants.INFO
 6304                                                ELSE ''
 6305                                            END
 6306                                            ,
 6307                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6308                                                        AND (
 6309                                                        concat({sql_query_annotation_update_info_sets_sql})
 6310                                                        )
 6311                                                        NOT IN ('','.') 
 6312                                                    THEN ';'
 6313                                                    ELSE ''
 6314                                            END
 6315                                            ,
 6316                                            {sql_query_annotation_update_info_sets_sql}
 6317                                            )
 6318                                    {sql_query_annotation_from_clause}
 6319                                    WHERE {sql_query_annotation_where_clause}
 6320                                    ;
 6321                                """
 6322
 6323                            # Add update query to dict
 6324                            query_dict[
 6325                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6326                            ] = sql_query_annotation_chrom_interval_pos
 6327
 6328                        nb_of_query = len(query_dict)
 6329                        num_query = 0
 6330
 6331                        # SET max_expression_depth TO x
 6332                        self.conn.execute("SET max_expression_depth TO 10000")
 6333
 6334                        for query_name in query_dict:
 6335                            query = query_dict[query_name]
 6336                            num_query += 1
 6337                            log.info(
 6338                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6339                            )
 6340                            result = self.conn.execute(query)
 6341                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6342                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6343                            log.info(
 6344                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6345                            )
 6346
 6347                        log.info(
 6348                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6349                        )
 6350
 6351                    else:
 6352
 6353                        log.info(
 6354                            f"Annotation '{annotation_name}' - No Annotations available"
 6355                        )
 6356
 6357                    log.debug("Final header: " + str(vcf_reader.infos))
 6358
 6359        # Remove added columns
 6360        for added_column in added_columns:
 6361            self.drop_column(column=added_column)
 6362
 6363    def annotation_splice(self, threads: int = None) -> None:
 6364        """
 6365        This function annotate with snpEff
 6366
 6367        :param threads: The number of threads to use
 6368        :return: the value of the variable "return_value".
 6369        """
 6370
 6371        # DEBUG
 6372        log.debug("Start annotation with splice tools")
 6373
 6374        # Threads
 6375        if not threads:
 6376            threads = self.get_threads()
 6377        log.debug("Threads: " + str(threads))
 6378
 6379        # DEBUG
 6380        delete_tmp = True
 6381        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6382            delete_tmp = False
 6383            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6384
 6385        # Config
 6386        config = self.get_config()
 6387        log.debug("Config: " + str(config))
 6388        splice_config = config.get("tools", {}).get("splice", {})
 6389        if not splice_config:
 6390            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6391            msg_err = "No Splice tool config"
 6392            raise ValueError(msg_err)
 6393        log.debug(f"splice_config: {splice_config}")
 6394
 6395        # Config - Folders - Databases
 6396        databases_folders = (
 6397            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6398        )
 6399        log.debug("Databases annotations: " + str(databases_folders))
 6400
 6401        # Splice docker image
 6402        splice_docker_image = splice_config.get("docker").get("image")
 6403
 6404        # Pull splice image if it's not already there
 6405        if not check_docker_image_exists(splice_docker_image):
 6406            log.warning(
 6407                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6408            )
 6409            try:
 6410                command(f"docker pull {splice_config.get('docker').get('image')}")
 6411            except subprocess.CalledProcessError:
 6412                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6413                log.error(msg_err)
 6414                raise ValueError(msg_err)
 6415
 6416        # Config - splice databases
 6417        splice_databases = (
 6418            config.get("folders", {})
 6419            .get("databases", {})
 6420            .get("splice", DEFAULT_SPLICE_FOLDER)
 6421        )
 6422        splice_databases = full_path(splice_databases)
 6423
 6424        # Param
 6425        param = self.get_param()
 6426        log.debug("Param: " + str(param))
 6427
 6428        # Param
 6429        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6430        log.debug("Options: " + str(options))
 6431
 6432        # Data
 6433        table_variants = self.get_table_variants()
 6434
 6435        # Check if not empty
 6436        log.debug("Check if not empty")
 6437        sql_query_chromosomes = (
 6438            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6439        )
 6440        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6441            log.info("VCF empty")
 6442            return None
 6443
 6444        # Export in VCF
 6445        log.debug("Create initial file to annotate")
 6446
 6447        # Create output folder / work folder
 6448        if options.get("output_folder", ""):
 6449            output_folder = options.get("output_folder", "")
 6450            if not os.path.exists(output_folder):
 6451                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6452        else:
 6453            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6454            if not os.path.exists(output_folder):
 6455                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6456
 6457        if options.get("workdir", ""):
 6458            workdir = options.get("workdir", "")
 6459        else:
 6460            workdir = "/work"
 6461
 6462        # Create tmp VCF file
 6463        tmp_vcf = NamedTemporaryFile(
 6464            prefix=self.get_prefix(),
 6465            dir=output_folder,
 6466            suffix=".vcf",
 6467            delete=False,
 6468        )
 6469        tmp_vcf_name = tmp_vcf.name
 6470
 6471        # VCF header
 6472        header = self.get_header()
 6473
 6474        # Existing annotations
 6475        for vcf_annotation in self.get_header().infos:
 6476
 6477            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6478            log.debug(
 6479                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6480            )
 6481
 6482        # Memory limit
 6483        if config.get("memory", None):
 6484            memory_limit = config.get("memory", "8G").upper()
 6485            # upper()
 6486        else:
 6487            memory_limit = "8G"
 6488        log.debug(f"memory_limit: {memory_limit}")
 6489
 6490        # Check number of variants to annotate
 6491        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6492        where_clause_regex_spip = r"SPiP_\w+"
 6493        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6494        df_list_of_variants_to_annotate = self.get_query_to_df(
 6495            query=f""" SELECT * FROM variants {where_clause} """
 6496        )
 6497        if len(df_list_of_variants_to_annotate) == 0:
 6498            log.warning(
 6499                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6500            )
 6501            return None
 6502        else:
 6503            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6504
 6505        # Export VCF file
 6506        self.export_variant_vcf(
 6507            vcf_file=tmp_vcf_name,
 6508            remove_info=True,
 6509            add_samples=True,
 6510            index=False,
 6511            where_clause=where_clause,
 6512        )
 6513        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6514        if any(value for value in splice_config.values() if value is None):
 6515            log.warning("At least one splice config parameter is empty")
 6516            # exit annotation_splice
 6517            return None
 6518
 6519        # Params in splice nf
 6520        def check_values(dico: dict):
 6521            """
 6522            Ensure parameters for NF splice pipeline
 6523            """
 6524            for key, val in dico.items():
 6525                if key == "genome":
 6526                    if any(
 6527                        assemb in options.get("genome", {})
 6528                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6529                    ):
 6530                        yield f"--{key} hg19"
 6531                    elif any(
 6532                        assemb in options.get("genome", {})
 6533                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6534                    ):
 6535                        yield f"--{key} hg38"
 6536                elif (
 6537                    (isinstance(val, str) and val)
 6538                    or isinstance(val, int)
 6539                    or isinstance(val, bool)
 6540                ):
 6541                    yield f"--{key} {val}"
 6542
 6543        # Genome
 6544        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6545        options["genome"] = genome
 6546        # NF params
 6547        nf_params = []
 6548        # Add options
 6549        if options:
 6550            log.debug(options)
 6551            nf_params = list(check_values(options))
 6552            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6553        else:
 6554            log.debug("No NF params provided")
 6555        # Add threads
 6556        if "threads" not in options.keys():
 6557            nf_params.append(f"--threads {threads}")
 6558        # Genome path
 6559        genome_path = find_genome(
 6560            config.get("folders", {})
 6561            .get("databases", {})
 6562            .get("genomes", DEFAULT_GENOME_FOLDER),
 6563            file=f"{genome}.fa",
 6564        )
 6565        # Add genome path
 6566        if not genome_path:
 6567            raise ValueError(
 6568                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6569            )
 6570        else:
 6571            log.debug(f"Genome: {genome_path}")
 6572            nf_params.append(f"--genome_path {genome_path}")
 6573
 6574        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6575            """
 6576            Setting up updated databases for SPiP and SpliceAI
 6577            """
 6578
 6579            try:
 6580
 6581                # SpliceAI assembly transcriptome
 6582                spliceai_assembly = os.path.join(
 6583                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6584                    options.get("genome"),
 6585                    "transcriptome",
 6586                )
 6587                spip_assembly = options.get("genome")
 6588
 6589                spip = find(
 6590                    f"transcriptome_{spip_assembly}.RData",
 6591                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6592                )
 6593                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6594                log.debug(f"SPiP annotations: {spip}")
 6595                log.debug(f"SpliceAI annotations: {spliceai}")
 6596                if spip and spliceai:
 6597                    return [
 6598                        f"--spip_transcriptome {spip}",
 6599                        f"--spliceai_transcriptome {spliceai}",
 6600                    ]
 6601                else:
 6602                    log.warning(
 6603                        "Can't find splice databases in configuration, use annotations file from image"
 6604                    )
 6605            except TypeError:
 6606                log.warning(
 6607                    "Can't find splice databases in configuration, use annotations file from image"
 6608                )
 6609                return []
 6610
 6611        # Add options, check if transcriptome option have already beend provided
 6612        if (
 6613            "spip_transcriptome" not in nf_params
 6614            and "spliceai_transcriptome" not in nf_params
 6615        ):
 6616            splice_reference = splice_annotations(options, config)
 6617            if splice_reference:
 6618                nf_params.extend(splice_reference)
 6619        # nf_params.append(f"--output_folder {output_folder}")
 6620        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6621        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6622        log.debug(cmd)
 6623        splice_config["docker"]["command"] = cmd
 6624
 6625        # Ensure proxy is set
 6626        proxy = [
 6627            f"-e {var}={os.getenv(var)}"
 6628            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6629            if os.getenv(var) is not None
 6630        ]
 6631        docker_cmd = get_bin_command(
 6632            tool="splice",
 6633            bin_type="docker",
 6634            config=config,
 6635            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6636            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6637        )
 6638        # print(docker_cmd)
 6639        # exit()
 6640        # Docker debug
 6641        # if splice_config.get("rm_container"):
 6642        #     rm_container = "--rm"
 6643        # else:
 6644        #     rm_container = ""
 6645        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6646        log.debug(docker_cmd)
 6647        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6648        log.debug(res.stdout)
 6649        if res.stderr:
 6650            log.error(res.stderr)
 6651        res.check_returncode()
 6652        # Update variants
 6653        log.info("Annotation - Updating...")
 6654        # Test find output vcf
 6655        log.debug(
 6656            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6657        )
 6658        output_vcf = []
 6659        # Wrong folder to look in
 6660        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6661            if (
 6662                files
 6663                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6664            ):
 6665                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6666        # log.debug(os.listdir(options.get("output_folder")))
 6667        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6668        if not output_vcf:
 6669            log.debug(
 6670                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6671            )
 6672        else:
 6673            # Get new header from annotated vcf
 6674            log.debug(f"Initial header: {len(header.infos)} fields")
 6675            # Create new header with splice infos
 6676            new_vcf = Variants(input=output_vcf[0])
 6677            new_vcf_header = new_vcf.get_header().infos
 6678            for keys, infos in new_vcf_header.items():
 6679                if keys not in header.infos.keys():
 6680                    header.infos[keys] = infos
 6681            log.debug(f"New header: {len(header.infos)} fields")
 6682            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6683            self.update_from_vcf(output_vcf[0])
 6684
 6685        # Remove file
 6686        remove_if_exists(output_vcf)
 6687
 6688    ###
 6689    # Prioritization
 6690    ###
 6691
 6692    def get_config_default(self, name: str) -> dict:
 6693        """
 6694        The function `get_config_default` returns a dictionary containing default configurations for
 6695        various calculations and prioritizations.
 6696
 6697        :param name: The `get_config_default` function returns a dictionary containing default
 6698        configurations for different calculations and prioritizations. The `name` parameter is used to
 6699        specify which specific configuration to retrieve from the dictionary
 6700        :type name: str
 6701        :return: The function `get_config_default` returns a dictionary containing default configuration
 6702        settings for different calculations and prioritizations. The specific configuration settings are
 6703        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6704        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6705        returned. If there is no match, an empty dictionary is returned.
 6706        """
 6707
 6708        config_default = {
 6709            "calculations": {
 6710                "variant_chr_pos_alt_ref": {
 6711                    "type": "sql",
 6712                    "name": "variant_chr_pos_alt_ref",
 6713                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6714                    "available": False,
 6715                    "output_column_name": "variant_chr_pos_alt_ref",
 6716                    "output_column_type": "String",
 6717                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6718                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6719                    "operation_info": True,
 6720                },
 6721                "VARTYPE": {
 6722                    "type": "sql",
 6723                    "name": "VARTYPE",
 6724                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6725                    "available": True,
 6726                    "output_column_name": "VARTYPE",
 6727                    "output_column_type": "String",
 6728                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6729                    "operation_query": """
 6730                            CASE
 6731                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6732                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6733                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6734                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6735                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6736                                ELSE 'UNDEFINED'
 6737                            END
 6738                            """,
 6739                    "info_fields": ["SVTYPE"],
 6740                    "operation_info": True,
 6741                },
 6742                "snpeff_hgvs": {
 6743                    "type": "python",
 6744                    "name": "snpeff_hgvs",
 6745                    "description": "HGVS nomenclatures from snpEff annotation",
 6746                    "available": True,
 6747                    "function_name": "calculation_extract_snpeff_hgvs",
 6748                    "function_params": ["snpeff_hgvs", "ANN"],
 6749                },
 6750                "snpeff_ann_explode": {
 6751                    "type": "python",
 6752                    "name": "snpeff_ann_explode",
 6753                    "description": "Explode snpEff annotations with uniquify values",
 6754                    "available": True,
 6755                    "function_name": "calculation_snpeff_ann_explode",
 6756                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6757                },
 6758                "snpeff_ann_explode_uniquify": {
 6759                    "type": "python",
 6760                    "name": "snpeff_ann_explode_uniquify",
 6761                    "description": "Explode snpEff annotations",
 6762                    "available": True,
 6763                    "function_name": "calculation_snpeff_ann_explode",
 6764                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6765                },
 6766                "snpeff_ann_explode_json": {
 6767                    "type": "python",
 6768                    "name": "snpeff_ann_explode_json",
 6769                    "description": "Explode snpEff annotations in JSON format",
 6770                    "available": True,
 6771                    "function_name": "calculation_snpeff_ann_explode",
 6772                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6773                },
 6774                "NOMEN": {
 6775                    "type": "python",
 6776                    "name": "NOMEN",
 6777                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6778                    "available": True,
 6779                    "function_name": "calculation_extract_nomen",
 6780                    "function_params": [],
 6781                },
 6782                "FINDBYPIPELINE": {
 6783                    "type": "python",
 6784                    "name": "FINDBYPIPELINE",
 6785                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6786                    "available": True,
 6787                    "function_name": "calculation_find_by_pipeline",
 6788                    "function_params": ["findbypipeline"],
 6789                },
 6790                "FINDBYSAMPLE": {
 6791                    "type": "python",
 6792                    "name": "FINDBYSAMPLE",
 6793                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6794                    "available": True,
 6795                    "function_name": "calculation_find_by_pipeline",
 6796                    "function_params": ["findbysample"],
 6797                },
 6798                "GENOTYPECONCORDANCE": {
 6799                    "type": "python",
 6800                    "name": "GENOTYPECONCORDANCE",
 6801                    "description": "Concordance of genotype for multi caller VCF",
 6802                    "available": True,
 6803                    "function_name": "calculation_genotype_concordance",
 6804                    "function_params": [],
 6805                },
 6806                "BARCODE": {
 6807                    "type": "python",
 6808                    "name": "BARCODE",
 6809                    "description": "BARCODE as VaRank tool",
 6810                    "available": True,
 6811                    "function_name": "calculation_barcode",
 6812                    "function_params": [],
 6813                },
 6814                "BARCODEFAMILY": {
 6815                    "type": "python",
 6816                    "name": "BARCODEFAMILY",
 6817                    "description": "BARCODEFAMILY as VaRank tool",
 6818                    "available": True,
 6819                    "function_name": "calculation_barcode_family",
 6820                    "function_params": ["BCF"],
 6821                },
 6822                "TRIO": {
 6823                    "type": "python",
 6824                    "name": "TRIO",
 6825                    "description": "Inheritance for a trio family",
 6826                    "available": True,
 6827                    "function_name": "calculation_trio",
 6828                    "function_params": [],
 6829                },
 6830                "VAF": {
 6831                    "type": "python",
 6832                    "name": "VAF",
 6833                    "description": "Variant Allele Frequency (VAF) harmonization",
 6834                    "available": True,
 6835                    "function_name": "calculation_vaf_normalization",
 6836                    "function_params": [],
 6837                },
 6838                "VAF_stats": {
 6839                    "type": "python",
 6840                    "name": "VAF_stats",
 6841                    "description": "Variant Allele Frequency (VAF) statistics",
 6842                    "available": True,
 6843                    "function_name": "calculation_genotype_stats",
 6844                    "function_params": ["VAF"],
 6845                },
 6846                "DP_stats": {
 6847                    "type": "python",
 6848                    "name": "DP_stats",
 6849                    "description": "Depth (DP) statistics",
 6850                    "available": True,
 6851                    "function_name": "calculation_genotype_stats",
 6852                    "function_params": ["DP"],
 6853                },
 6854                "variant_id": {
 6855                    "type": "python",
 6856                    "name": "variant_id",
 6857                    "description": "Variant ID generated from variant position and type",
 6858                    "available": True,
 6859                    "function_name": "calculation_variant_id",
 6860                    "function_params": [],
 6861                },
 6862                "transcripts_json": {
 6863                    "type": "python",
 6864                    "name": "transcripts_json",
 6865                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6866                    "available": True,
 6867                    "function_name": "calculation_transcripts_annotation",
 6868                    "function_params": ["transcripts_json", None],
 6869                },
 6870                "transcripts_ann": {
 6871                    "type": "python",
 6872                    "name": "transcripts_ann",
 6873                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6874                    "available": True,
 6875                    "function_name": "calculation_transcripts_annotation",
 6876                    "function_params": [None, "transcripts_ann"],
 6877                },
 6878                "transcripts_annotations": {
 6879                    "type": "python",
 6880                    "name": "transcripts_annotations",
 6881                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6882                    "available": True,
 6883                    "function_name": "calculation_transcripts_annotation",
 6884                    "function_params": [None, None],
 6885                },
 6886                "transcripts_prioritization": {
 6887                    "type": "python",
 6888                    "name": "transcripts_prioritization",
 6889                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6890                    "available": True,
 6891                    "function_name": "calculation_transcripts_prioritization",
 6892                    "function_params": [],
 6893                },
 6894                "transcripts_export": {
 6895                    "type": "python",
 6896                    "name": "transcripts_export",
 6897                    "description": "Export transcripts table/view as a file (using param.json)",
 6898                    "available": True,
 6899                    "function_name": "calculation_transcripts_export",
 6900                    "function_params": [],
 6901                },
 6902            },
 6903            "prioritizations": {
 6904                "default": {
 6905                    "ANN2": [
 6906                        {
 6907                            "type": "contains",
 6908                            "value": "HIGH",
 6909                            "score": 5,
 6910                            "flag": "PASS",
 6911                            "comment": [
 6912                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6913                            ],
 6914                        },
 6915                        {
 6916                            "type": "contains",
 6917                            "value": "MODERATE",
 6918                            "score": 3,
 6919                            "flag": "PASS",
 6920                            "comment": [
 6921                                "A non-disruptive variant that might change protein effectiveness"
 6922                            ],
 6923                        },
 6924                        {
 6925                            "type": "contains",
 6926                            "value": "LOW",
 6927                            "score": 0,
 6928                            "flag": "FILTERED",
 6929                            "comment": [
 6930                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6931                            ],
 6932                        },
 6933                        {
 6934                            "type": "contains",
 6935                            "value": "MODIFIER",
 6936                            "score": 0,
 6937                            "flag": "FILTERED",
 6938                            "comment": [
 6939                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6940                            ],
 6941                        },
 6942                    ],
 6943                }
 6944            },
 6945        }
 6946
 6947        return config_default.get(name, None)
 6948
 6949    def get_config_json(
 6950        self, name: str, config_dict: dict = {}, config_file: str = None
 6951    ) -> dict:
 6952        """
 6953        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6954        default values, a dictionary, and a file.
 6955
 6956        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6957        the name of the configuration. It is used to identify and retrieve the configuration settings
 6958        for a specific component or module
 6959        :type name: str
 6960        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6961        dictionary that allows you to provide additional configuration settings or overrides. When you
 6962        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6963        the key is the configuration setting you want to override or
 6964        :type config_dict: dict
 6965        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6966        specify the path to a configuration file that contains additional settings. If provided, the
 6967        function will read the contents of this file and update the configuration dictionary with the
 6968        values found in the file, overriding any existing values with the
 6969        :type config_file: str
 6970        :return: The function `get_config_json` returns a dictionary containing the configuration
 6971        settings.
 6972        """
 6973
 6974        # Create with default prioritizations
 6975        config_default = self.get_config_default(name=name)
 6976        configuration = config_default
 6977        # log.debug(f"configuration={configuration}")
 6978
 6979        # Replace prioritizations from dict
 6980        for config in config_dict:
 6981            configuration[config] = config_dict[config]
 6982
 6983        # Replace prioritizations from file
 6984        config_file = full_path(config_file)
 6985        if config_file:
 6986            if os.path.exists(config_file):
 6987                with open(config_file) as config_file_content:
 6988                    config_file_dict = json.load(config_file_content)
 6989                for config in config_file_dict:
 6990                    configuration[config] = config_file_dict[config]
 6991            else:
 6992                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6993                log.error(msg_error)
 6994                raise ValueError(msg_error)
 6995
 6996        return configuration
 6997
 6998    def prioritization(
 6999        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7000    ) -> bool:
 7001        """
 7002        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7003        prioritizes variants based on configured profiles and criteria.
 7004
 7005        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7006        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7007        a table name is provided, the method will prioritize the variants in that specific table
 7008        :type table: str
 7009        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7010        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7011        provided, the code will use a default prefix value of "PZ"
 7012        :type pz_prefix: str
 7013        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7014        additional parameters specific to the prioritization process. These parameters can include
 7015        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7016        configurations needed for the prioritization of variants in a V
 7017        :type pz_param: dict
 7018        :return: A boolean value (True) is being returned from the `prioritization` function.
 7019        """
 7020
 7021        # Config
 7022        config = self.get_config()
 7023
 7024        # Param
 7025        param = self.get_param()
 7026
 7027        # Prioritization param
 7028        if pz_param is not None:
 7029            prioritization_param = pz_param
 7030        else:
 7031            prioritization_param = param.get("prioritization", {})
 7032
 7033        # Configuration profiles
 7034        prioritization_config_file = prioritization_param.get(
 7035            "prioritization_config", None
 7036        )
 7037        prioritization_config_file = full_path(prioritization_config_file)
 7038        prioritizations_config = self.get_config_json(
 7039            name="prioritizations", config_file=prioritization_config_file
 7040        )
 7041
 7042        # Prioritization prefix
 7043        pz_prefix_default = "PZ"
 7044        if pz_prefix is None:
 7045            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7046
 7047        # Prioritization options
 7048        profiles = prioritization_param.get("profiles", [])
 7049        if isinstance(profiles, str):
 7050            profiles = profiles.split(",")
 7051        pzfields = prioritization_param.get(
 7052            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7053        )
 7054        if isinstance(pzfields, str):
 7055            pzfields = pzfields.split(",")
 7056        default_profile = prioritization_param.get("default_profile", None)
 7057        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7058        prioritization_score_mode = prioritization_param.get(
 7059            "prioritization_score_mode", "HOWARD"
 7060        )
 7061
 7062        # Quick Prioritizations
 7063        prioritizations = param.get("prioritizations", None)
 7064        if prioritizations:
 7065            log.info("Quick Prioritization:")
 7066            for profile in prioritizations.split(","):
 7067                if profile not in profiles:
 7068                    profiles.append(profile)
 7069                    log.info(f"   {profile}")
 7070
 7071        # If profile "ALL" provided, all profiles in the config profiles
 7072        if "ALL" in profiles:
 7073            profiles = list(prioritizations_config.keys())
 7074
 7075        for profile in profiles:
 7076            if prioritizations_config.get(profile, None):
 7077                log.debug(f"Profile '{profile}' configured")
 7078            else:
 7079                msg_error = f"Profile '{profile}' NOT configured"
 7080                log.error(msg_error)
 7081                raise ValueError(msg_error)
 7082
 7083        if profiles:
 7084            log.info(f"Prioritization... ")
 7085        else:
 7086            log.debug(f"No profile defined")
 7087            return False
 7088
 7089        if not default_profile and len(profiles):
 7090            default_profile = profiles[0]
 7091
 7092        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7093        log.debug("Profiles to check: " + str(list(profiles)))
 7094
 7095        # Variables
 7096        if table is not None:
 7097            table_variants = table
 7098        else:
 7099            table_variants = self.get_table_variants(clause="update")
 7100        log.debug(f"Table to prioritize: {table_variants}")
 7101
 7102        # Added columns
 7103        added_columns = []
 7104
 7105        # Create list of PZfields
 7106        # List of PZFields
 7107        list_of_pzfields_original = pzfields + [
 7108            pzfield + pzfields_sep + profile
 7109            for pzfield in pzfields
 7110            for profile in profiles
 7111        ]
 7112        list_of_pzfields = []
 7113        log.debug(f"{list_of_pzfields_original}")
 7114
 7115        # Remove existing PZfields to use if exists
 7116        for pzfield in list_of_pzfields_original:
 7117            if self.get_header().infos.get(pzfield, None) is None:
 7118                list_of_pzfields.append(pzfield)
 7119                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7120            else:
 7121                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7122
 7123        if list_of_pzfields:
 7124
 7125            # Explode Infos prefix
 7126            explode_infos_prefix = self.get_explode_infos_prefix()
 7127
 7128            # PZfields tags description
 7129            PZfields_INFOS = {
 7130                f"{pz_prefix}Tags": {
 7131                    "ID": f"{pz_prefix}Tags",
 7132                    "Number": ".",
 7133                    "Type": "String",
 7134                    "Description": "Variant tags based on annotation criteria",
 7135                },
 7136                f"{pz_prefix}Score": {
 7137                    "ID": f"{pz_prefix}Score",
 7138                    "Number": 1,
 7139                    "Type": "Integer",
 7140                    "Description": "Variant score based on annotation criteria",
 7141                },
 7142                f"{pz_prefix}Flag": {
 7143                    "ID": f"{pz_prefix}Flag",
 7144                    "Number": 1,
 7145                    "Type": "String",
 7146                    "Description": "Variant flag based on annotation criteria",
 7147                },
 7148                f"{pz_prefix}Comment": {
 7149                    "ID": f"{pz_prefix}Comment",
 7150                    "Number": ".",
 7151                    "Type": "String",
 7152                    "Description": "Variant comment based on annotation criteria",
 7153                },
 7154                f"{pz_prefix}Infos": {
 7155                    "ID": f"{pz_prefix}Infos",
 7156                    "Number": ".",
 7157                    "Type": "String",
 7158                    "Description": "Variant infos based on annotation criteria",
 7159                },
 7160                f"{pz_prefix}Class": {
 7161                    "ID": f"{pz_prefix}Class",
 7162                    "Number": ".",
 7163                    "Type": "String",
 7164                    "Description": "Variant class based on annotation criteria",
 7165                },
 7166            }
 7167
 7168            # Create INFO fields if not exist
 7169            for field in PZfields_INFOS:
 7170                field_ID = PZfields_INFOS[field]["ID"]
 7171                field_description = PZfields_INFOS[field]["Description"]
 7172                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7173                    field_description = (
 7174                        PZfields_INFOS[field]["Description"]
 7175                        + f", profile {default_profile}"
 7176                    )
 7177                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7178                        field_ID,
 7179                        PZfields_INFOS[field]["Number"],
 7180                        PZfields_INFOS[field]["Type"],
 7181                        field_description,
 7182                        "unknown",
 7183                        "unknown",
 7184                        code_type_map[PZfields_INFOS[field]["Type"]],
 7185                    )
 7186
 7187            # Create INFO fields if not exist for each profile
 7188            for profile in prioritizations_config:
 7189                if profile in profiles or profiles == []:
 7190                    for field in PZfields_INFOS:
 7191                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7192                        field_description = (
 7193                            PZfields_INFOS[field]["Description"]
 7194                            + f", profile {profile}"
 7195                        )
 7196                        if (
 7197                            field_ID not in self.get_header().infos
 7198                            and field in pzfields
 7199                        ):
 7200                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7201                                field_ID,
 7202                                PZfields_INFOS[field]["Number"],
 7203                                PZfields_INFOS[field]["Type"],
 7204                                field_description,
 7205                                "unknown",
 7206                                "unknown",
 7207                                code_type_map[PZfields_INFOS[field]["Type"]],
 7208                            )
 7209
 7210            # Header
 7211            for pzfield in list_of_pzfields:
 7212                if re.match(f"{pz_prefix}Score.*", pzfield):
 7213                    added_column = self.add_column(
 7214                        table_name=table_variants,
 7215                        column_name=pzfield,
 7216                        column_type="INTEGER",
 7217                        default_value="0",
 7218                    )
 7219                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7220                    added_column = self.add_column(
 7221                        table_name=table_variants,
 7222                        column_name=pzfield,
 7223                        column_type="BOOLEAN",
 7224                        default_value="1",
 7225                    )
 7226                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7227                    added_column = self.add_column(
 7228                        table_name=table_variants,
 7229                        column_name=pzfield,
 7230                        column_type="VARCHAR[]",
 7231                        default_value="null",
 7232                    )
 7233                else:
 7234                    added_column = self.add_column(
 7235                        table_name=table_variants,
 7236                        column_name=pzfield,
 7237                        column_type="STRING",
 7238                        default_value="''",
 7239                    )
 7240                added_columns.append(added_column)
 7241
 7242            # Profiles
 7243            if profiles:
 7244
 7245                # foreach profile in configuration file
 7246                for profile in prioritizations_config:
 7247
 7248                    # If profile is asked in param, or ALL are asked (empty profile [])
 7249                    if profile in profiles or profiles == []:
 7250                        log.info(f"Profile '{profile}'")
 7251
 7252                        sql_set_info_option = ""
 7253
 7254                        sql_set_info = []
 7255
 7256                        # PZ fields set
 7257
 7258                        # PZScore
 7259                        if (
 7260                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7261                            in list_of_pzfields
 7262                        ):
 7263                            sql_set_info.append(
 7264                                f"""
 7265                                    concat(
 7266                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7267                                        {pz_prefix}Score{pzfields_sep}{profile}
 7268                                    ) 
 7269                                """
 7270                            )
 7271                            if (
 7272                                profile == default_profile
 7273                                and f"{pz_prefix}Score" in list_of_pzfields
 7274                            ):
 7275                                sql_set_info.append(
 7276                                    f"""
 7277                                        concat(
 7278                                            '{pz_prefix}Score=',
 7279                                            {pz_prefix}Score{pzfields_sep}{profile}
 7280                                        )
 7281                                    """
 7282                                )
 7283
 7284                        # PZFlag
 7285                        if (
 7286                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7287                            in list_of_pzfields
 7288                        ):
 7289                            sql_set_info.append(
 7290                                f"""
 7291                                    concat(
 7292                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7293                                        CASE 
 7294                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7295                                            THEN 'PASS'
 7296                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7297                                            THEN 'FILTERED'
 7298                                        END
 7299                                    ) 
 7300                                """
 7301                            )
 7302                            if (
 7303                                profile == default_profile
 7304                                and f"{pz_prefix}Flag" in list_of_pzfields
 7305                            ):
 7306                                sql_set_info.append(
 7307                                    f"""
 7308                                        concat(
 7309                                            '{pz_prefix}Flag=',
 7310                                            CASE 
 7311                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7312                                                THEN 'PASS'
 7313                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7314                                                THEN 'FILTERED'
 7315                                            END
 7316                                        )
 7317                                    """
 7318                                )
 7319
 7320                        # PZClass
 7321                        if (
 7322                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7323                            in list_of_pzfields
 7324                        ):
 7325                            sql_set_info.append(
 7326                                f"""
 7327                                    concat(
 7328                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7329                                        CASE
 7330                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7331                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7332                                            ELSE '.'
 7333                                        END 
 7334                                    )
 7335                                    
 7336                                """
 7337                            )
 7338                            if (
 7339                                profile == default_profile
 7340                                and f"{pz_prefix}Class" in list_of_pzfields
 7341                            ):
 7342                                sql_set_info.append(
 7343                                    f"""
 7344                                        concat(
 7345                                            '{pz_prefix}Class=',
 7346                                            CASE
 7347                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7348                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7349                                                ELSE '.'
 7350                                            END 
 7351                                        )
 7352                                    """
 7353                                )
 7354
 7355                        # PZComment
 7356                        if (
 7357                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7358                            in list_of_pzfields
 7359                        ):
 7360                            sql_set_info.append(
 7361                                f"""
 7362                                    CASE
 7363                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7364                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7365                                        ELSE ''
 7366                                    END
 7367                                """
 7368                            )
 7369                            if (
 7370                                profile == default_profile
 7371                                and f"{pz_prefix}Comment" in list_of_pzfields
 7372                            ):
 7373                                sql_set_info.append(
 7374                                    f"""
 7375                                        CASE
 7376                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7377                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7378                                            ELSE ''
 7379                                        END
 7380                                    """
 7381                                )
 7382
 7383                        # PZInfos
 7384                        if (
 7385                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7386                            in list_of_pzfields
 7387                        ):
 7388                            sql_set_info.append(
 7389                                f"""
 7390                                    CASE
 7391                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7392                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7393                                        ELSE ''
 7394                                    END
 7395                                """
 7396                            )
 7397                            if (
 7398                                profile == default_profile
 7399                                and f"{pz_prefix}Infos" in list_of_pzfields
 7400                            ):
 7401                                sql_set_info.append(
 7402                                    f"""
 7403                                        CASE
 7404                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7405                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7406                                            ELSE ''
 7407                                        END
 7408                                    """
 7409                                )
 7410
 7411                        # Merge PZfields
 7412                        sql_set_info_option = ""
 7413                        sql_set_sep = ""
 7414                        for sql_set in sql_set_info:
 7415                            if sql_set_sep:
 7416                                sql_set_info_option += f"""
 7417                                    , concat('{sql_set_sep}', {sql_set})
 7418                                """
 7419                            else:
 7420                                sql_set_info_option += f"""
 7421                                    , {sql_set}
 7422                                """
 7423                            sql_set_sep = ";"
 7424
 7425                        sql_queries = []
 7426                        for annotation in prioritizations_config[profile]:
 7427
 7428                            # skip special sections
 7429                            if annotation.startswith("_"):
 7430                                continue
 7431
 7432                            # For each criterions
 7433                            for criterion in prioritizations_config[profile][
 7434                                annotation
 7435                            ]:
 7436
 7437                                # Criterion mode
 7438                                criterion_mode = None
 7439                                if np.any(
 7440                                    np.isin(list(criterion.keys()), ["type", "value"])
 7441                                ):
 7442                                    criterion_mode = "operation"
 7443                                elif np.any(
 7444                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7445                                ):
 7446                                    criterion_mode = "sql"
 7447                                log.debug(f"Criterion Mode: {criterion_mode}")
 7448
 7449                                # Criterion parameters
 7450                                criterion_type = criterion.get("type", None)
 7451                                criterion_value = criterion.get("value", None)
 7452                                criterion_sql = criterion.get("sql", None)
 7453                                criterion_fields = criterion.get("fields", None)
 7454                                criterion_score = criterion.get("score", 0)
 7455                                criterion_flag = criterion.get("flag", "PASS")
 7456                                criterion_class = criterion.get("class", None)
 7457                                criterion_flag_bool = criterion_flag == "PASS"
 7458                                criterion_comment = (
 7459                                    ", ".join(criterion.get("comment", []))
 7460                                    .replace("'", "''")
 7461                                    .replace(";", ",")
 7462                                    .replace("\t", " ")
 7463                                )
 7464                                criterion_infos = (
 7465                                    str(criterion)
 7466                                    .replace("'", "''")
 7467                                    .replace(";", ",")
 7468                                    .replace("\t", " ")
 7469                                )
 7470
 7471                                # SQL
 7472                                if criterion_sql is not None and isinstance(
 7473                                    criterion_sql, list
 7474                                ):
 7475                                    criterion_sql = " ".join(criterion_sql)
 7476
 7477                                # Fields and explode
 7478                                if criterion_fields is None:
 7479                                    criterion_fields = [annotation]
 7480                                if not isinstance(criterion_fields, list):
 7481                                    criterion_fields = str(criterion_fields).split(",")
 7482
 7483                                # Class
 7484                                if criterion_class is not None and not isinstance(
 7485                                    criterion_class, list
 7486                                ):
 7487                                    criterion_class = str(criterion_class).split(",")
 7488
 7489                                for annotation_field in criterion_fields:
 7490
 7491                                    # Explode specific annotation
 7492                                    log.debug(
 7493                                        f"Explode annotation '{annotation_field}'"
 7494                                    )
 7495                                    added_columns += self.explode_infos(
 7496                                        prefix=explode_infos_prefix,
 7497                                        fields=[annotation_field],
 7498                                        table=table_variants,
 7499                                    )
 7500                                    extra_infos = self.get_extra_infos(
 7501                                        table=table_variants
 7502                                    )
 7503
 7504                                    # Check if annotation field is present
 7505                                    if (
 7506                                        f"{explode_infos_prefix}{annotation_field}"
 7507                                        not in extra_infos
 7508                                    ):
 7509                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7510                                        log.error(msq_err)
 7511                                        raise ValueError(msq_err)
 7512                                    else:
 7513                                        log.debug(
 7514                                            f"Annotation '{annotation_field}' in data"
 7515                                        )
 7516
 7517                                sql_set = []
 7518                                sql_set_info = []
 7519
 7520                                # PZ fields set
 7521
 7522                                # PZScore
 7523                                if (
 7524                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7525                                    in list_of_pzfields
 7526                                ):
 7527                                    # if prioritization_score_mode == "HOWARD":
 7528                                    #     sql_set.append(
 7529                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7530                                    #     )
 7531                                    # VaRank prioritization score mode
 7532                                    if prioritization_score_mode == "VaRank":
 7533                                        sql_set.append(
 7534                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7535                                        )
 7536                                    # default HOWARD prioritization score mode
 7537                                    else:
 7538                                        sql_set.append(
 7539                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7540                                        )
 7541
 7542                                # PZFlag
 7543                                if (
 7544                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7545                                    in list_of_pzfields
 7546                                ):
 7547                                    sql_set.append(
 7548                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7549                                    )
 7550
 7551                                # PZClass
 7552                                if (
 7553                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7554                                    in list_of_pzfields
 7555                                    and criterion_class is not None
 7556                                ):
 7557                                    sql_set.append(
 7558                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7559                                    )
 7560
 7561                                # PZComment
 7562                                if (
 7563                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7564                                    in list_of_pzfields
 7565                                ):
 7566                                    sql_set.append(
 7567                                        f"""
 7568                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7569                                                concat(
 7570                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7571                                                    CASE 
 7572                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7573                                                        THEN ', '
 7574                                                        ELSE ''
 7575                                                    END,
 7576                                                    '{criterion_comment}'
 7577                                                )
 7578                                        """
 7579                                    )
 7580
 7581                                # PZInfos
 7582                                if (
 7583                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7584                                    in list_of_pzfields
 7585                                ):
 7586                                    sql_set.append(
 7587                                        f"""
 7588                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7589                                                concat(
 7590                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7591                                                    '{criterion_infos}'
 7592                                                )
 7593                                        """
 7594                                    )
 7595                                sql_set_option = ",".join(sql_set)
 7596
 7597                                # Criterion and comparison
 7598                                if sql_set_option:
 7599
 7600                                    if criterion_mode in ["operation"]:
 7601
 7602                                        try:
 7603                                            float(criterion_value)
 7604                                            sql_update = f"""
 7605                                                UPDATE {table_variants}
 7606                                                SET {sql_set_option}
 7607                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7608                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7609                                            """
 7610                                        except:
 7611                                            contains_option = ""
 7612                                            if criterion_type == "contains":
 7613                                                contains_option = ".*"
 7614                                            sql_update = f"""
 7615                                                UPDATE {table_variants}
 7616                                                SET {sql_set_option}
 7617                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7618                                            """
 7619                                        sql_queries.append(sql_update)
 7620
 7621                                    elif criterion_mode in ["sql"]:
 7622
 7623                                        sql_update = f"""
 7624                                            UPDATE {table_variants}
 7625                                            SET {sql_set_option}
 7626                                            WHERE {criterion_sql}
 7627                                        """
 7628                                        sql_queries.append(sql_update)
 7629
 7630                                    else:
 7631                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7632                                        log.error(msg_err)
 7633                                        raise ValueError(msg_err)
 7634
 7635                                else:
 7636                                    log.warning(
 7637                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7638                                    )
 7639
 7640                        # PZTags
 7641                        if (
 7642                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7643                            in list_of_pzfields
 7644                        ):
 7645
 7646                            # Create PZFalgs value
 7647                            pztags_value = ""
 7648                            pztags_sep_default = ","
 7649                            pztags_sep = ""
 7650                            for pzfield in pzfields:
 7651                                if pzfield not in [f"{pz_prefix}Tags"]:
 7652                                    if (
 7653                                        f"{pzfield}{pzfields_sep}{profile}"
 7654                                        in list_of_pzfields
 7655                                    ):
 7656                                        if pzfield in [f"{pz_prefix}Flag"]:
 7657                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7658                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7659                                                    THEN 'PASS'
 7660                                                    ELSE 'FILTERED'
 7661                                                END, '"""
 7662                                        elif pzfield in [f"{pz_prefix}Class"]:
 7663                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7664                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7665                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7666                                                    ELSE '.'
 7667                                                END, '"""
 7668                                        else:
 7669                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7670                                        pztags_sep = pztags_sep_default
 7671
 7672                            # Add Query update for PZFlags
 7673                            sql_update_pztags = f"""
 7674                                UPDATE {table_variants}
 7675                                SET INFO = concat(
 7676                                        INFO,
 7677                                        CASE WHEN INFO NOT in ('','.')
 7678                                                THEN ';'
 7679                                                ELSE ''
 7680                                        END,
 7681                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7682                                    )
 7683                                """
 7684                            sql_queries.append(sql_update_pztags)
 7685
 7686                            # Add Query update for PZFlags for default
 7687                            if profile == default_profile:
 7688                                sql_update_pztags_default = f"""
 7689                                UPDATE {table_variants}
 7690                                SET INFO = concat(
 7691                                        INFO,
 7692                                        ';',
 7693                                        '{pz_prefix}Tags={pztags_value}'
 7694                                    )
 7695                                """
 7696                                sql_queries.append(sql_update_pztags_default)
 7697
 7698                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7699
 7700                        if sql_queries:
 7701
 7702                            for sql_query in sql_queries:
 7703                                log.debug(
 7704                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7705                                )
 7706                                self.conn.execute(sql_query)
 7707
 7708                        log.info(f"""Profile '{profile}' - Update... """)
 7709                        sql_query_update = f"""
 7710                            UPDATE {table_variants}
 7711                            SET INFO =  
 7712                                concat(
 7713                                    CASE
 7714                                        WHEN INFO NOT IN ('','.')
 7715                                        THEN concat(INFO, ';')
 7716                                        ELSE ''
 7717                                    END
 7718                                    {sql_set_info_option}
 7719                                )
 7720                        """
 7721                        self.conn.execute(sql_query_update)
 7722
 7723        else:
 7724
 7725            log.warning(f"No profiles in parameters")
 7726
 7727        # Remove added columns
 7728        for added_column in added_columns:
 7729            self.drop_column(column=added_column)
 7730
 7731        # Explode INFOS fields into table fields
 7732        if self.get_explode_infos():
 7733            self.explode_infos(
 7734                prefix=self.get_explode_infos_prefix(),
 7735                fields=self.get_explode_infos_fields(),
 7736                force=True,
 7737            )
 7738
 7739        return True
 7740
 7741    ###
 7742    # HGVS
 7743    ###
 7744
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Pipeline (as implemented below):
        - load refSeq (and optionally refSeqLink) databases into DuckDB and Polars dataframes,
        - select SNV/InDel variants (purely alphabetic REF/ALT) from the variants table,
        - compute HGVS names per variant in parallel via Dask partitions,
        - write results back into a temporary column, append them to the INFO field,
        - declare the new 'hgvs' INFO field in the VCF header.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed (one variant per row)
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object providing the "CHROM", "POS", "REF" and "ALT"
            values of one variant
            :return: a string that contains the comma-separated HGVS names associated with the
            given row of data.
            """

            # NOTE(review): 'chr' shadows the builtin; kept as-is (renaming would change code)
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # Queries the Polars-registered refseq_df for transcripts overlapping this position
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): refseqlink_df is only defined when a refSeqLink file was
                # found; if use_protein/add_protein/full_format is set without one, this
                # query raises — TODO confirm upstream parameters guarantee the file
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When 'add_protein' is set (and protein form not already produced),
                # emit a second, protein-level HGVS name for the same transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): re-created further below once the dataframes exist; the nested
        # functions resolve 'polars_conn' late, so they see the latest binding.
        # register_globals=True is relied upon to expose refseq_df/refseqlink_df
        # (function locals) to SQL — confirm Polars registers frames from this scope
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (with default fallback)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Genome path as configured (no default) — used for direct lookup first
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refSeq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS
        # Parse comma-separated 'hgvs_options' shortcuts (e.g. "use_gene,codon_type=1")
        # into the param["hgvs"] dict; bare options default to True
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                # Coerce textual booleans; other values stay strings
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink: param-level values override config-level ones
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: direct path first, then folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only
        # (REF/ALT restricted to alphabetic sequences — excludes symbolic/breakend alleles)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (tracked by caller conventions; dropped elsewhere)
        added_columns = []

        # Add hgvs column in variants table
        # Random suffix avoids clashing with an existing column of the same name
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # Join restricts to transcripts overlapping at least one variant position
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            # Maps transcript accessions (with version) to protein accessions
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # Exported to TSV first because read_transcripts consumes a file handle
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion (re-bound now that refseq_df/refseqlink_df exist;
        # the nested partition functions resolve this name at call time)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            # NOTE(review): the WHERE clause hardcodes the 'variants' alias while the
            # UPDATE target is {table_variants} — assumes get_table_variants() returns
            # 'variants'; TODO confirm
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=...' with ';' separator when INFO is non-empty
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header: declare the 'hgvs' INFO field in the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )
 8123
 8124        # Remove added columns
 8125        for added_column in added_columns:
 8126            self.drop_column(column=added_column)
 8127
 8128    ###
 8129    # Calculation
 8130    ###
 8131
 8132    def get_operations_help(
 8133        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8134    ) -> list:
 8135
 8136        # Init
 8137        operations_help = []
 8138
 8139        # operations
 8140        operations = self.get_config_json(
 8141            name="calculations",
 8142            config_dict=operations_config_dict,
 8143            config_file=operations_config_file,
 8144        )
 8145        for op in operations:
 8146            op_name = operations[op].get("name", op).upper()
 8147            op_description = operations[op].get("description", op_name)
 8148            op_available = operations[op].get("available", False)
 8149            if op_available:
 8150                operations_help.append(f"   {op_name}: {op_description}")
 8151
 8152        # Sort operations
 8153        operations_help.sort()
 8154
 8155        # insert header
 8156        operations_help.insert(0, "Available calculation operations:")
 8157
 8158        # Return
 8159        return operations_help
 8160
 8161    def calculation(
 8162        self,
 8163        operations: dict = {},
 8164        operations_config_dict: dict = {},
 8165        operations_config_file: str = None,
 8166    ) -> None:
 8167        """
 8168        It takes a list of operations, and for each operation, it checks if it's a python or sql
 8169        operation, and then calls the appropriate function
 8170
 8171        param json example:
 8172            "calculation": {
 8173                "NOMEN": {
 8174                    "options": {
 8175                        "hgvs_field": "hgvs"
 8176                    },
 8177                "middle" : null
 8178            }
 8179        """
 8180
 8181        # Param
 8182        param = self.get_param()
 8183
 8184        # operations config
 8185        operations_config = self.get_config_json(
 8186            name="calculations",
 8187            config_dict=operations_config_dict,
 8188            config_file=operations_config_file,
 8189        )
 8190
 8191        # Upper keys
 8192        operations_config = {k.upper(): v for k, v in operations_config.items()}
 8193
 8194        # Calculations
 8195
 8196        # Operations from param
 8197        operations = param.get("calculation", {}).get("calculations", operations)
 8198
 8199        # Quick calculation - add
 8200        if param.get("calculations", None):
 8201
 8202            # List of operations
 8203            calculations_list = [
 8204                value.strip() for value in param.get("calculations", "").split(",")
 8205            ]
 8206
 8207            # Log
 8208            log.info(f"Quick Calculations:")
 8209            for calculation_key in calculations_list:
 8210                log.info(f"   {calculation_key}")
 8211
 8212            # Create tmp operations (to keep operation order)
 8213            operations_tmp = {}
 8214            for calculation_operation in calculations_list:
 8215                if calculation_operation.upper() not in operations_tmp:
 8216                    log.debug(
 8217                        f"{calculation_operation}.upper() not in {operations_tmp}"
 8218                    )
 8219                    operations_tmp[calculation_operation.upper()] = {}
 8220                    add_value_into_dict(
 8221                        dict_tree=operations_tmp,
 8222                        sections=[
 8223                            calculation_operation.upper(),
 8224                        ],
 8225                        value=operations.get(calculation_operation.upper(), {}),
 8226                    )
 8227            # Add operations already in param
 8228            for calculation_operation in operations:
 8229                if calculation_operation not in operations_tmp:
 8230                    operations_tmp[calculation_operation] = operations.get(
 8231                        calculation_operation, {}
 8232                    )
 8233
 8234            # Update operations in param
 8235            operations = operations_tmp
 8236
 8237        # Operations for calculation
 8238        if not operations:
 8239            operations = param.get("calculation", {}).get("calculations", {})
 8240
 8241        if operations:
 8242            log.info(f"Calculations...")
 8243
 8244        # For each operations
 8245        for operation_name in operations:
 8246            operation_name = operation_name.upper()
 8247            if operation_name not in [""]:
 8248                if operation_name in operations_config:
 8249                    log.info(f"Calculation '{operation_name}'")
 8250                    operation = operations_config[operation_name]
 8251                    operation_type = operation.get("type", "sql")
 8252                    if operation_type == "python":
 8253                        self.calculation_process_function(
 8254                            operation=operation, operation_name=operation_name
 8255                        )
 8256                    elif operation_type == "sql":
 8257                        self.calculation_process_sql(
 8258                            operation=operation, operation_name=operation_name
 8259                        )
 8260                    else:
 8261                        log.error(
 8262                            f"Operations config: Type '{operation_type}' NOT available"
 8263                        )
 8264                        raise ValueError(
 8265                            f"Operations config: Type '{operation_type}' NOT available"
 8266                        )
 8267                else:
 8268                    log.error(
 8269                        f"Operations config: Calculation '{operation_name}' NOT available"
 8270                    )
 8271                    raise ValueError(
 8272                        f"Operations config: Calculation '{operation_name}' NOT available"
 8273                    )
 8274
 8275        # Explode INFOS fields into table fields
 8276        if self.get_explode_infos():
 8277            self.explode_infos(
 8278                prefix=self.get_explode_infos_prefix(),
 8279                fields=self.get_explode_infos_fields(),
 8280                force=True,
 8281            )
 8282
 8283    def calculation_process_sql(
 8284        self, operation: dict, operation_name: str = "unknown"
 8285    ) -> None:
 8286        """
 8287        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8288        performs the operation, updating the specified table with the result.
 8289
 8290        :param operation: The `operation` parameter is a dictionary that contains information about the
 8291        mathematical operation to be performed. It includes the following keys:
 8292        :type operation: dict
 8293        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8294        the mathematical operation being performed. It is used for logging and error handling purposes,
 8295        defaults to unknown
 8296        :type operation_name: str (optional)
 8297        """
 8298
 8299        # table variants
 8300        table_variants = self.get_table_variants(clause="alter")
 8301
 8302        # Operation infos
 8303        operation_name = operation.get("name", "unknown")
 8304        log.debug(f"process sql {operation_name}")
 8305        output_column_name = operation.get("output_column_name", operation_name)
 8306        output_column_type = operation.get("output_column_type", "String")
 8307        prefix = operation.get("explode_infos_prefix", "")
 8308        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8309        output_column_description = operation.get(
 8310            "output_column_description", f"{operation_name} operation"
 8311        )
 8312        operation_query = operation.get("operation_query", None)
 8313        if isinstance(operation_query, list):
 8314            operation_query = " ".join(operation_query)
 8315        operation_info_fields = operation.get("info_fields", [])
 8316        operation_info_fields_check = operation.get("info_fields_check", False)
 8317        operation_info = operation.get("operation_info", True)
 8318
 8319        if operation_query:
 8320
 8321            # Info fields check
 8322            operation_info_fields_check_result = True
 8323            if operation_info_fields_check:
 8324                header_infos = self.get_header().infos
 8325                for info_field in operation_info_fields:
 8326                    operation_info_fields_check_result = (
 8327                        operation_info_fields_check_result
 8328                        and info_field in header_infos
 8329                    )
 8330
 8331            # If info fields available
 8332            if operation_info_fields_check_result:
 8333
 8334                # Added_columns
 8335                added_columns = []
 8336
 8337                # Create VCF header field
 8338                vcf_reader = self.get_header()
 8339                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8340                    output_column_name,
 8341                    ".",
 8342                    output_column_type,
 8343                    output_column_description,
 8344                    "howard calculation",
 8345                    "0",
 8346                    self.code_type_map.get(output_column_type),
 8347                )
 8348
 8349                # Explode infos if needed
 8350                log.debug(f"calculation_process_sql prefix {prefix}")
 8351                added_columns += self.explode_infos(
 8352                    prefix=prefix,
 8353                    fields=[output_column_name] + operation_info_fields,
 8354                    force=True,
 8355                )
 8356
 8357                # Create column
 8358                added_column = self.add_column(
 8359                    table_name=table_variants,
 8360                    column_name=prefix + output_column_name,
 8361                    column_type=output_column_type_sql,
 8362                    default_value="null",
 8363                )
 8364                added_columns.append(added_column)
 8365
 8366                # Operation calculation
 8367                try:
 8368
 8369                    # Query to update calculation column
 8370                    sql_update = f"""
 8371                        UPDATE {table_variants}
 8372                        SET "{prefix}{output_column_name}" = ({operation_query})
 8373                    """
 8374                    self.conn.execute(sql_update)
 8375
 8376                    # Add to INFO
 8377                    if operation_info:
 8378                        sql_update_info = f"""
 8379                            UPDATE {table_variants}
 8380                            SET "INFO" =
 8381                                concat(
 8382                                    CASE
 8383                                        WHEN "INFO" IS NOT NULL
 8384                                        THEN concat("INFO", ';')
 8385                                        ELSE ''
 8386                                    END,
 8387                                    '{output_column_name}=',
 8388                                    "{prefix}{output_column_name}"
 8389                                )
 8390                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8391                        """
 8392                        self.conn.execute(sql_update_info)
 8393
 8394                except:
 8395                    log.error(
 8396                        f"Operations config: Calculation '{operation_name}' query failed"
 8397                    )
 8398                    raise ValueError(
 8399                        f"Operations config: Calculation '{operation_name}' query failed"
 8400                    )
 8401
 8402                # Remove added columns
 8403                for added_column in added_columns:
 8404                    log.debug(f"added_column: {added_column}")
 8405                    self.drop_column(column=added_column)
 8406
 8407            else:
 8408                log.error(
 8409                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8410                )
 8411                raise ValueError(
 8412                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8413                )
 8414
 8415        else:
 8416            log.error(
 8417                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8418            )
 8419            raise ValueError(
 8420                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8421            )
 8422
 8423    def calculation_process_function(
 8424        self, operation: dict, operation_name: str = "unknown"
 8425    ) -> None:
 8426        """
 8427        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8428        function with the given parameters.
 8429
 8430        :param operation: The `operation` parameter is a dictionary that contains information about the
 8431        operation to be performed. It has the following keys:
 8432        :type operation: dict
 8433        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8434        the operation being performed. It is used for logging purposes, defaults to unknown
 8435        :type operation_name: str (optional)
 8436        """
 8437
 8438        operation_name = operation["name"]
 8439        log.debug(f"process sql {operation_name}")
 8440        function_name = operation["function_name"]
 8441        function_params = operation["function_params"]
 8442        getattr(self, function_name)(*function_params)
 8443
 8444    def calculation_variant_id(self) -> None:
 8445        """
 8446        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8447        updates the INFO field of a variants table with the variant ID.
 8448        """
 8449
 8450        # variant_id annotation field
 8451        variant_id_tag = self.get_variant_id_column()
 8452        added_columns = [variant_id_tag]
 8453
 8454        # variant_id hgvs tags"
 8455        vcf_infos_tags = {
 8456            variant_id_tag: "howard variant ID annotation",
 8457        }
 8458
 8459        # Variants table
 8460        table_variants = self.get_table_variants()
 8461
 8462        # Header
 8463        vcf_reader = self.get_header()
 8464
 8465        # Add variant_id to header
 8466        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8467            variant_id_tag,
 8468            ".",
 8469            "String",
 8470            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8471            "howard calculation",
 8472            "0",
 8473            self.code_type_map.get("String"),
 8474        )
 8475
 8476        # Update
 8477        sql_update = f"""
 8478            UPDATE {table_variants}
 8479            SET "INFO" = 
 8480                concat(
 8481                    CASE
 8482                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8483                        THEN ''
 8484                        ELSE concat("INFO", ';')
 8485                    END,
 8486                    '{variant_id_tag}=',
 8487                    "{variant_id_tag}"
 8488                )
 8489        """
 8490        self.conn.execute(sql_update)
 8491
 8492        # Remove added columns
 8493        for added_column in added_columns:
 8494            self.drop_column(column=added_column)
 8495
 8496    def calculation_extract_snpeff_hgvs(
 8497        self,
 8498        snpeff_hgvs: str = "snpeff_hgvs",
 8499        snpeff_field: str = "ANN",
 8500    ) -> None:
 8501        """
 8502        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8503        annotation field in a VCF file and adds them as a new column in the variants table.
 8504
 8505        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8506        function is used to specify the name of the column that will store the HGVS nomenclatures
 8507        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8508        snpeff_hgvs
 8509        :type snpeff_hgvs: str (optional)
 8510        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8511        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8512        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8513        to ANN
 8514        :type snpeff_field: str (optional)
 8515        """
 8516
 8517        # Snpeff hgvs tags
 8518        vcf_infos_tags = {
 8519            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8520        }
 8521
 8522        # Prefix
 8523        prefix = self.get_explode_infos_prefix()
 8524        if prefix:
 8525            prefix = "INFO/"
 8526
 8527        # snpEff fields
 8528        speff_ann_infos = prefix + snpeff_field
 8529        speff_hgvs_infos = prefix + snpeff_hgvs
 8530
 8531        # Variants table
 8532        table_variants = self.get_table_variants()
 8533
 8534        # Header
 8535        vcf_reader = self.get_header()
 8536
 8537        # Add columns
 8538        added_columns = []
 8539
 8540        # Explode HGVS field in column
 8541        added_columns += self.explode_infos(fields=[snpeff_field])
 8542
 8543        if snpeff_field in vcf_reader.infos:
 8544
 8545            log.debug(vcf_reader.infos[snpeff_field])
 8546
 8547            # Extract ANN header
 8548            ann_description = vcf_reader.infos[snpeff_field].desc
 8549            pattern = r"'(.+?)'"
 8550            match = re.search(pattern, ann_description)
 8551            if match:
 8552                ann_header_match = match.group(1).split(" | ")
 8553                ann_header_desc = {}
 8554                for i in range(len(ann_header_match)):
 8555                    ann_header_info = "".join(
 8556                        char for char in ann_header_match[i] if char.isalnum()
 8557                    )
 8558                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8559                if not ann_header_desc:
 8560                    raise ValueError("Invalid header description format")
 8561            else:
 8562                raise ValueError("Invalid header description format")
 8563
 8564            # Create variant id
 8565            variant_id_column = self.get_variant_id_column()
 8566            added_columns += [variant_id_column]
 8567
 8568            # Create dataframe
 8569            dataframe_snpeff_hgvs = self.get_query_to_df(
 8570                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8571            )
 8572
 8573            # Create main NOMEN column
 8574            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8575                speff_ann_infos
 8576            ].apply(
 8577                lambda x: extract_snpeff_hgvs(
 8578                    str(x), header=list(ann_header_desc.values())
 8579                )
 8580            )
 8581
 8582            # Add snpeff_hgvs to header
 8583            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8584                snpeff_hgvs,
 8585                ".",
 8586                "String",
 8587                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8588                "howard calculation",
 8589                "0",
 8590                self.code_type_map.get("String"),
 8591            )
 8592
 8593            # Update
 8594            sql_update = f"""
 8595                UPDATE variants
 8596                SET "INFO" = 
 8597                    concat(
 8598                        CASE
 8599                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8600                            THEN ''
 8601                            ELSE concat("INFO", ';')
 8602                        END,
 8603                        CASE 
 8604                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8605                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8606                            THEN concat(
 8607                                    '{snpeff_hgvs}=',
 8608                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8609                                )
 8610                            ELSE ''
 8611                        END
 8612                    )
 8613                FROM dataframe_snpeff_hgvs
 8614                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8615
 8616            """
 8617            self.conn.execute(sql_update)
 8618
 8619            # Delete dataframe
 8620            del dataframe_snpeff_hgvs
 8621            gc.collect()
 8622
 8623        else:
 8624
 8625            log.warning(
 8626                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8627            )
 8628
 8629        # Remove added columns
 8630        for added_column in added_columns:
 8631            self.drop_column(column=added_column)
 8632
 8633    def calculation_snpeff_ann_explode(
 8634        self,
 8635        uniquify: bool = True,
 8636        output_format: str = "fields",
 8637        output_prefix: str = "snpeff_",
 8638        snpeff_field: str = "ANN",
 8639    ) -> None:
 8640        """
 8641        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8642        exploding the HGVS field and updating variant information accordingly.
 8643
 8644        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8645        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8646        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8647        defaults to True
 8648        :type uniquify: bool (optional)
 8649        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8650        function specifies the format in which the output annotations will be generated. It has a
 8651        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8652        format, defaults to fields
 8653        :type output_format: str (optional)
 8654        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8655        method is used to specify the prefix that will be added to the output annotations generated
 8656        during the calculation process. This prefix helps to differentiate the newly added annotations
 8657        from existing ones in the output data. By default, the, defaults to ANN_
 8658        :type output_prefix: str (optional)
 8659        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8660        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8661        field will be processed to explode the HGVS annotations and update the variant information
 8662        accordingly, defaults to ANN
 8663        :type snpeff_field: str (optional)
 8664        """
 8665
 8666        # SnpEff annotation field
 8667        snpeff_hgvs = "snpeff_ann_explode"
 8668
 8669        # Snpeff hgvs tags
 8670        vcf_infos_tags = {
 8671            snpeff_hgvs: "Explode snpEff annotations",
 8672        }
 8673
 8674        # Prefix
 8675        prefix = self.get_explode_infos_prefix()
 8676        if prefix:
 8677            prefix = "INFO/"
 8678
 8679        # snpEff fields
 8680        speff_ann_infos = prefix + snpeff_field
 8681        speff_hgvs_infos = prefix + snpeff_hgvs
 8682
 8683        # Variants table
 8684        table_variants = self.get_table_variants()
 8685
 8686        # Header
 8687        vcf_reader = self.get_header()
 8688
 8689        # Add columns
 8690        added_columns = []
 8691
 8692        # Explode HGVS field in column
 8693        added_columns += self.explode_infos(fields=[snpeff_field])
 8694        log.debug(f"snpeff_field={snpeff_field}")
 8695        log.debug(f"added_columns={added_columns}")
 8696
 8697        if snpeff_field in vcf_reader.infos:
 8698
 8699            # Extract ANN header
 8700            ann_description = vcf_reader.infos[snpeff_field].desc
 8701            pattern = r"'(.+?)'"
 8702            match = re.search(pattern, ann_description)
 8703            if match:
 8704                ann_header_match = match.group(1).split(" | ")
 8705                ann_header = []
 8706                ann_header_desc = {}
 8707                for i in range(len(ann_header_match)):
 8708                    ann_header_info = "".join(
 8709                        char for char in ann_header_match[i] if char.isalnum()
 8710                    )
 8711                    ann_header.append(ann_header_info)
 8712                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8713                if not ann_header_desc:
 8714                    raise ValueError("Invalid header description format")
 8715            else:
 8716                raise ValueError("Invalid header description format")
 8717
 8718            # Create variant id
 8719            variant_id_column = self.get_variant_id_column()
 8720            added_columns += [variant_id_column]
 8721
 8722            # Create dataframe
 8723            dataframe_snpeff_hgvs = self.get_query_to_df(
 8724                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8725            )
 8726
 8727            # Create snpEff columns
 8728            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8729                speff_ann_infos
 8730            ].apply(
 8731                lambda x: explode_snpeff_ann(
 8732                    str(x),
 8733                    uniquify=uniquify,
 8734                    output_format=output_format,
 8735                    prefix=output_prefix,
 8736                    header=list(ann_header_desc.values()),
 8737                )
 8738            )
 8739
 8740            # Header
 8741            ann_annotations_prefix = ""
 8742            if output_format.upper() in ["JSON"]:
 8743                ann_annotations_prefix = f"{output_prefix}="
 8744                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8745                    output_prefix,
 8746                    ".",
 8747                    "String",
 8748                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8749                    + " - JSON format",
 8750                    "howard calculation",
 8751                    "0",
 8752                    self.code_type_map.get("String"),
 8753                )
 8754            else:
 8755                for ann_annotation in ann_header:
 8756                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8757                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8758                        ann_annotation_id,
 8759                        ".",
 8760                        "String",
 8761                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8762                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8763                        "howard calculation",
 8764                        "0",
 8765                        self.code_type_map.get("String"),
 8766                    )
 8767
 8768            # Update
 8769            sql_update = f"""
 8770                UPDATE variants
 8771                SET "INFO" = 
 8772                    concat(
 8773                        CASE
 8774                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8775                            THEN ''
 8776                            ELSE concat("INFO", ';')
 8777                        END,
 8778                        CASE 
 8779                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8780                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8781                            THEN concat(
 8782                                '{ann_annotations_prefix}',
 8783                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8784                                )
 8785                            ELSE ''
 8786                        END
 8787                    )
 8788                FROM dataframe_snpeff_hgvs
 8789                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8790
 8791            """
 8792            self.conn.execute(sql_update)
 8793
 8794            # Delete dataframe
 8795            del dataframe_snpeff_hgvs
 8796            gc.collect()
 8797
 8798        else:
 8799
 8800            log.warning(
 8801                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8802            )
 8803
 8804        # Remove added columns
 8805        for added_column in added_columns:
 8806            self.drop_column(column=added_column)
 8807
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        The configured HGVS annotation field is exploded into a column, parsed
        per variant with `find_nomen` (honoring an optional preferred-transcript
        file/column and an optional pattern), and each resulting NOMEN sub-field
        (NOMEN, CNOMEN, PNOMEN, ...) is appended to the "INFO" column and
        declared in the VCF header.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the temporary dataframe column holding the dict returned by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: INFO tag name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Calculation parameters
        param = self.get_param()

        # Prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # VCF header object (INFO declarations are added to it below)
        vcf_reader = self.get_header()

        # Columns added by this calculation, dropped again at the end
        added_columns = []

        # INFO field holding the HGVS annotation to parse (default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional pattern forwarded to find_nomen for NOMEN construction
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # Transcripts of preference, keyed by source ("file", ...)
        transcripts_sources = {}

        # Optional file listing preferred transcripts (first column is used)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Table holding a per-variant preferred-transcript column (defaults to the variants table)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Name of the per-variant preferred-transcript column (optional)
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # Qualified column to select as per-variant transcript of preference
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode the column into the table if not already present
            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            # No per-variant transcript column: select SQL NULL instead
            extra_field_transcript = f"NULL"

        # Order in which transcript sources are considered by find_nomen
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Preferred transcripts loaded from the file source (empty if none)
        transcripts = transcripts_sources.get("file", [])

        # Explode the HGVS INFO field into a queryable column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Extra (exploded) columns available on the variants table
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Fetch variant keys, the HGVS string and the optional per-variant transcript.
            # NOTE: the dataframe variable name is referenced by DuckDB in the
            # UPDATE query below (replacement scan), so it must not be renamed.
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Parse each HGVS string into a dict of NOMEN sub-fields
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode the NOMEN dict into one column per sub-field and build
            # one SQL CASE fragment per sub-field for the INFO update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # One dataframe column per NOMEN sub-field (empty string if absent)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the sub-field in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Each fragment appends ';<FIELD>=<value>' only for non-empty values
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Comma-join the fragments as arguments of the SQL concat() below
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append all NOMEN sub-fields to INFO, joining on the variant key
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release dataframe memory
            del dataframe_hgvs
            gc.collect()

        # Drop the temporary exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 9006
 9007    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9008        """
 9009        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9010        pipeline/sample for a variant and updates the variant information in a VCF file.
 9011
 9012        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9013        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9014        VCF header and to update the corresponding field in the variants table, defaults to
 9015        findbypipeline
 9016        :type tag: str (optional)
 9017        """
 9018
 9019        # if FORMAT and samples
 9020        if (
 9021            "FORMAT" in self.get_header_columns_as_list()
 9022            and self.get_header_sample_list()
 9023        ):
 9024
 9025            # findbypipeline annotation field
 9026            findbypipeline_tag = tag
 9027
 9028            # VCF infos tags
 9029            vcf_infos_tags = {
 9030                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9031            }
 9032
 9033            # Prefix
 9034            prefix = self.get_explode_infos_prefix()
 9035
 9036            # Field
 9037            findbypipeline_infos = prefix + findbypipeline_tag
 9038
 9039            # Variants table
 9040            table_variants = self.get_table_variants()
 9041
 9042            # Header
 9043            vcf_reader = self.get_header()
 9044
 9045            # Create variant id
 9046            variant_id_column = self.get_variant_id_column()
 9047            added_columns = [variant_id_column]
 9048
 9049            # variant_id, FORMAT and samples
 9050            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9051                self.get_header_sample_list()
 9052            )
 9053
 9054            # Create dataframe
 9055            dataframe_findbypipeline = self.get_query_to_df(
 9056                f""" SELECT {samples_fields} FROM {table_variants} """
 9057            )
 9058
 9059            # Create findbypipeline column
 9060            dataframe_findbypipeline[findbypipeline_infos] = (
 9061                dataframe_findbypipeline.apply(
 9062                    lambda row: findbypipeline(
 9063                        row, samples=self.get_header_sample_list()
 9064                    ),
 9065                    axis=1,
 9066                )
 9067            )
 9068
 9069            # Add snpeff_hgvs to header
 9070            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9071                findbypipeline_tag,
 9072                ".",
 9073                "String",
 9074                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9075                "howard calculation",
 9076                "0",
 9077                self.code_type_map.get("String"),
 9078            )
 9079
 9080            # Update
 9081            sql_update = f"""
 9082                UPDATE variants
 9083                SET "INFO" = 
 9084                    concat(
 9085                        CASE
 9086                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9087                            THEN ''
 9088                            ELSE concat("INFO", ';')
 9089                        END,
 9090                        CASE 
 9091                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9092                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9093                            THEN concat(
 9094                                    '{findbypipeline_tag}=',
 9095                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9096                                )
 9097                            ELSE ''
 9098                        END
 9099                    )
 9100                FROM dataframe_findbypipeline
 9101                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9102            """
 9103            self.conn.execute(sql_update)
 9104
 9105            # Remove added columns
 9106            for added_column in added_columns:
 9107                self.drop_column(column=added_column)
 9108
 9109            # Delete dataframe
 9110            del dataframe_findbypipeline
 9111            gc.collect()
 9112
 9113    def calculation_genotype_concordance(self) -> None:
 9114        """
 9115        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9116        multi-caller VCF files and updates the variant information in the database.
 9117        """
 9118
 9119        # if FORMAT and samples
 9120        if (
 9121            "FORMAT" in self.get_header_columns_as_list()
 9122            and self.get_header_sample_list()
 9123        ):
 9124
 9125            # genotypeconcordance annotation field
 9126            genotypeconcordance_tag = "genotypeconcordance"
 9127
 9128            # VCF infos tags
 9129            vcf_infos_tags = {
 9130                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9131            }
 9132
 9133            # Prefix
 9134            prefix = self.get_explode_infos_prefix()
 9135
 9136            # Field
 9137            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9138
 9139            # Variants table
 9140            table_variants = self.get_table_variants()
 9141
 9142            # Header
 9143            vcf_reader = self.get_header()
 9144
 9145            # Create variant id
 9146            variant_id_column = self.get_variant_id_column()
 9147            added_columns = [variant_id_column]
 9148
 9149            # variant_id, FORMAT and samples
 9150            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9151                self.get_header_sample_list()
 9152            )
 9153
 9154            # Create dataframe
 9155            dataframe_genotypeconcordance = self.get_query_to_df(
 9156                f""" SELECT {samples_fields} FROM {table_variants} """
 9157            )
 9158
 9159            # Create genotypeconcordance column
 9160            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9161                dataframe_genotypeconcordance.apply(
 9162                    lambda row: genotypeconcordance(
 9163                        row, samples=self.get_header_sample_list()
 9164                    ),
 9165                    axis=1,
 9166                )
 9167            )
 9168
 9169            # Add genotypeconcordance to header
 9170            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9171                genotypeconcordance_tag,
 9172                ".",
 9173                "String",
 9174                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9175                "howard calculation",
 9176                "0",
 9177                self.code_type_map.get("String"),
 9178            )
 9179
 9180            # Update
 9181            sql_update = f"""
 9182                UPDATE variants
 9183                SET "INFO" = 
 9184                    concat(
 9185                        CASE
 9186                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9187                            THEN ''
 9188                            ELSE concat("INFO", ';')
 9189                        END,
 9190                        CASE
 9191                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9192                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9193                            THEN concat(
 9194                                    '{genotypeconcordance_tag}=',
 9195                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9196                                )
 9197                            ELSE ''
 9198                        END
 9199                    )
 9200                FROM dataframe_genotypeconcordance
 9201                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9202            """
 9203            self.conn.execute(sql_update)
 9204
 9205            # Remove added columns
 9206            for added_column in added_columns:
 9207                self.drop_column(column=added_column)
 9208
 9209            # Delete dataframe
 9210            del dataframe_genotypeconcordance
 9211            gc.collect()
 9212
 9213    def calculation_barcode(self, tag: str = "barcode") -> None:
 9214        """
 9215        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9216        updates the INFO field in the file with the calculated barcode values.
 9217
 9218        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9219        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9220        the default tag name is set to "barcode", defaults to barcode
 9221        :type tag: str (optional)
 9222        """
 9223
 9224        # if FORMAT and samples
 9225        if (
 9226            "FORMAT" in self.get_header_columns_as_list()
 9227            and self.get_header_sample_list()
 9228        ):
 9229
 9230            # barcode annotation field
 9231            if not tag:
 9232                tag = "barcode"
 9233
 9234            # VCF infos tags
 9235            vcf_infos_tags = {
 9236                tag: "barcode calculation (VaRank)",
 9237            }
 9238
 9239            # Prefix
 9240            prefix = self.get_explode_infos_prefix()
 9241
 9242            # Field
 9243            barcode_infos = prefix + tag
 9244
 9245            # Variants table
 9246            table_variants = self.get_table_variants()
 9247
 9248            # Header
 9249            vcf_reader = self.get_header()
 9250
 9251            # Create variant id
 9252            variant_id_column = self.get_variant_id_column()
 9253            added_columns = [variant_id_column]
 9254
 9255            # variant_id, FORMAT and samples
 9256            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9257                self.get_header_sample_list()
 9258            )
 9259
 9260            # Create dataframe
 9261            dataframe_barcode = self.get_query_to_df(
 9262                f""" SELECT {samples_fields} FROM {table_variants} """
 9263            )
 9264
 9265            # Create barcode column
 9266            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9267                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9268            )
 9269
 9270            # Add barcode to header
 9271            vcf_reader.infos[tag] = vcf.parser._Info(
 9272                tag,
 9273                ".",
 9274                "String",
 9275                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9276                "howard calculation",
 9277                "0",
 9278                self.code_type_map.get("String"),
 9279            )
 9280
 9281            # Update
 9282            sql_update = f"""
 9283                UPDATE {table_variants}
 9284                SET "INFO" = 
 9285                    concat(
 9286                        CASE
 9287                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9288                            THEN ''
 9289                            ELSE concat("INFO", ';')
 9290                        END,
 9291                        CASE
 9292                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9293                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9294                            THEN concat(
 9295                                    '{tag}=',
 9296                                    dataframe_barcode."{barcode_infos}"
 9297                                )
 9298                            ELSE ''
 9299                        END
 9300                    )
 9301                FROM dataframe_barcode
 9302                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9303            """
 9304            self.conn.execute(sql_update)
 9305
 9306            # Remove added columns
 9307            for added_column in added_columns:
 9308                self.drop_column(column=added_column)
 9309
 9310            # Delete dataframe
 9311            del dataframe_barcode
 9312            gc.collect()
 9313
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The family pedigree (option 'family_pedigree') may be a JSON file path,
        a JSON string, a comma-separated list of sample names, or a dict; when
        absent, all header samples are used. The computed barcode and the list
        of family samples are appended as two extra FORMAT fields (`tag` and
        `tag` + "S") to every sample column. The calculation only runs when
        the file has a FORMAT column and at least one sample.

        :param tag: The `tag` parameter is used to specify the barcode tag
        that will be added to the VCF file during the calculation process. If
        no value is provided, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or empty
        """

        # Requires a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default if empty)
            if not tag:
                tag = "BCF"

            # VCF header descriptions for the two FORMAT fields
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Calculation parameters
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree parameter (file path, JSON string, sample list or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a path to a JSON file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated samples
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict: nothing to do
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Any other type is rejected
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Family samples are the dict values
                ped_samples = list(ped.values())

            else:
                # No pedigree: every header sample is its own family member
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Reject an empty pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the family members used for the calculation
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column receiving the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header object
            vcf_reader = self.get_header()

            # Temporary variant identifier column (dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT and the family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotype data. NOTE: the variable name is used by DuckDB
            # to resolve the dataframe in the UPDATE query below.
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode for each variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the two FORMAT fields (barcode and family samples)
            # in the VCF header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per sample column (plus FORMAT):
            # family samples get the barcode value, FORMAT gets the field
            # names, other samples get '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Used to turn FORMAT into a run of ':.' placeholders for
                # missing ('./.') genotypes before appending the new fields
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release dataframe memory
            del dataframe_barcode
            gc.collect()
 9503
 9504    def calculation_trio(self) -> None:
 9505        """
 9506        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9507        information to the INFO field of each variant.
 9508        """
 9509
 9510        # if FORMAT and samples
 9511        if (
 9512            "FORMAT" in self.get_header_columns_as_list()
 9513            and self.get_header_sample_list()
 9514        ):
 9515
 9516            # trio annotation field
 9517            trio_tag = "trio"
 9518
 9519            # VCF infos tags
 9520            vcf_infos_tags = {
 9521                "trio": "trio calculation",
 9522            }
 9523
 9524            # Param
 9525            param = self.get_param()
 9526
 9527            # Prefix
 9528            prefix = self.get_explode_infos_prefix()
 9529
 9530            # Trio param
 9531            trio_ped = (
 9532                param.get("calculation", {})
 9533                .get("calculations", {})
 9534                .get("TRIO", {})
 9535                .get("trio_pedigree", None)
 9536            )
 9537
 9538            # Load trio
 9539            if trio_ped:
 9540
 9541                # Trio pedigree is a file
 9542                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9543                    log.debug("TRIO pedigree is file")
 9544                    with open(full_path(trio_ped)) as trio_ped:
 9545                        trio_ped = json.load(trio_ped)
 9546
 9547                # Trio pedigree is a string
 9548                elif isinstance(trio_ped, str):
 9549                    log.debug("TRIO pedigree is str")
 9550                    try:
 9551                        trio_ped = json.loads(trio_ped)
 9552                        log.debug("TRIO pedigree is json str")
 9553                    except ValueError as e:
 9554                        trio_samples = trio_ped.split(",")
 9555                        if len(trio_samples) == 3:
 9556                            trio_ped = {
 9557                                "father": trio_samples[0],
 9558                                "mother": trio_samples[1],
 9559                                "child": trio_samples[2],
 9560                            }
 9561                            log.debug("TRIO pedigree is list str")
 9562                        else:
 9563                            msg_error = "TRIO pedigree not well formatted"
 9564                            log.error(msg_error)
 9565                            raise ValueError(msg_error)
 9566
 9567                # Trio pedigree is a dict
 9568                elif isinstance(trio_ped, dict):
 9569                    log.debug("TRIO pedigree is dict")
 9570
 9571                # Trio pedigree is not well formatted
 9572                else:
 9573                    msg_error = "TRIO pedigree not well formatted"
 9574                    log.error(msg_error)
 9575                    raise ValueError(msg_error)
 9576
 9577                # Construct trio list
 9578                trio_samples = [
 9579                    trio_ped.get("father", ""),
 9580                    trio_ped.get("mother", ""),
 9581                    trio_ped.get("child", ""),
 9582                ]
 9583
 9584            else:
 9585                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9586                samples_list = self.get_header_sample_list()
 9587                if len(samples_list) >= 3:
 9588                    trio_samples = self.get_header_sample_list()[0:3]
 9589                    trio_ped = {
 9590                        "father": trio_samples[0],
 9591                        "mother": trio_samples[1],
 9592                        "child": trio_samples[2],
 9593                    }
 9594                else:
 9595                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9596                    log.error(msg_error)
 9597                    raise ValueError(msg_error)
 9598
 9599            # Check trio pedigree
 9600            if not trio_ped or len(trio_ped) != 3:
 9601                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9602                log.error(msg_error)
 9603                raise ValueError(msg_error)
 9604
 9605            # Log
 9606            log.info(
 9607                f"Calculation 'TRIO' - Samples: "
 9608                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9609            )
 9610
 9611            # Field
 9612            trio_infos = prefix + trio_tag
 9613
 9614            # Variants table
 9615            table_variants = self.get_table_variants()
 9616
 9617            # Header
 9618            vcf_reader = self.get_header()
 9619
 9620            # Create variant id
 9621            variant_id_column = self.get_variant_id_column()
 9622            added_columns = [variant_id_column]
 9623
 9624            # variant_id, FORMAT and samples
 9625            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9626                self.get_header_sample_list()
 9627            )
 9628
 9629            # Create dataframe
 9630            dataframe_trio = self.get_query_to_df(
 9631                f""" SELECT {samples_fields} FROM {table_variants} """
 9632            )
 9633
 9634            # Create trio column
 9635            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9636                lambda row: trio(row, samples=trio_samples), axis=1
 9637            )
 9638
 9639            # Add trio to header
 9640            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9641                trio_tag,
 9642                ".",
 9643                "String",
 9644                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9645                "howard calculation",
 9646                "0",
 9647                self.code_type_map.get("String"),
 9648            )
 9649
 9650            # Update
 9651            sql_update = f"""
 9652                UPDATE {table_variants}
 9653                SET "INFO" = 
 9654                    concat(
 9655                        CASE
 9656                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9657                            THEN ''
 9658                            ELSE concat("INFO", ';')
 9659                        END,
 9660                        CASE
 9661                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9662                             AND dataframe_trio."{trio_infos}" NOT NULL
 9663                            THEN concat(
 9664                                    '{trio_tag}=',
 9665                                    dataframe_trio."{trio_infos}"
 9666                                )
 9667                            ELSE ''
 9668                        END
 9669                    )
 9670                FROM dataframe_trio
 9671                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9672            """
 9673            self.conn.execute(sql_update)
 9674
 9675            # Remove added columns
 9676            for added_column in added_columns:
 9677                self.drop_column(column=added_column)
 9678
 9679            # Delete dataframe
 9680            del dataframe_trio
 9681            gc.collect()
 9682
 9683    def calculation_vaf_normalization(self) -> None:
 9684        """
 9685        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9686        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9687        :return: The function does not return anything.
 9688        """
 9689
 9690        # if FORMAT and samples
 9691        if (
 9692            "FORMAT" in self.get_header_columns_as_list()
 9693            and self.get_header_sample_list()
 9694        ):
 9695
 9696            # vaf_normalization annotation field
 9697            vaf_normalization_tag = "VAF"
 9698
 9699            # VCF infos tags
 9700            vcf_infos_tags = {
 9701                "VAF": "VAF Variant Frequency",
 9702            }
 9703
 9704            # Prefix
 9705            prefix = self.get_explode_infos_prefix()
 9706
 9707            # Variants table
 9708            table_variants = self.get_table_variants()
 9709
 9710            # Header
 9711            vcf_reader = self.get_header()
 9712
 9713            # Do not calculate if VAF already exists
 9714            if "VAF" in vcf_reader.formats:
 9715                log.debug("VAF already on genotypes")
 9716                return
 9717
 9718            # Create variant id
 9719            variant_id_column = self.get_variant_id_column()
 9720            added_columns = [variant_id_column]
 9721
 9722            # variant_id, FORMAT and samples
 9723            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9724                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9725            )
 9726
 9727            # Create dataframe
 9728            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9729            log.debug(f"query={query}")
 9730            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9731
 9732            vaf_normalization_set = []
 9733
 9734            # for each sample vaf_normalization
 9735            for sample in self.get_header_sample_list():
 9736                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9737                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9738                )
 9739                vaf_normalization_set.append(
 9740                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9741                )
 9742
 9743            # Add VAF to FORMAT
 9744            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9745                "FORMAT"
 9746            ].apply(lambda x: str(x) + ":VAF")
 9747            vaf_normalization_set.append(
 9748                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9749            )
 9750
 9751            # Add vaf_normalization to header
 9752            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9753                id=vaf_normalization_tag,
 9754                num="1",
 9755                type="Float",
 9756                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9757                type_code=self.code_type_map.get("Float"),
 9758            )
 9759
 9760            # Create fields to add in INFO
 9761            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9762
 9763            # Update
 9764            sql_update = f"""
 9765                UPDATE {table_variants}
 9766                SET {sql_vaf_normalization_set}
 9767                FROM dataframe_vaf_normalization
 9768                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9769
 9770            """
 9771            self.conn.execute(sql_update)
 9772
 9773            # Remove added columns
 9774            for added_column in added_columns:
 9775                self.drop_column(column=added_column)
 9776
 9777            # Delete dataframe
 9778            del dataframe_vaf_normalization
 9779            gc.collect()
 9780
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Statistics produced (as INFO tags): `<info>_stats_nb`, `_stats_list`, `_stats_min`,
        `_stats_max`, `_stats_mean`, `_stats_mediane` and `_stats_stdev`. Nothing is done
        if the file has no FORMAT column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable if the VCF has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field (base name for all per-stat tags)
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one entry per produced statistic; also drives the loop below)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field holding the raw stats dict in the working dataframe
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (column added to the variants table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with genotypes only
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: one dict of statistics per variant row,
            # computed from all sample genotypes
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of SQL CASE fragments, one per statistic, joined into the INFO update
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add this statistic tag to the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # No separator before the first tag, ';' before the following ones
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO (empty string when the stat is NULL)
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update: append the stats tags to INFO, joining the duckdb-visible
            # dataframe on the variant id column
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
 9918
 9919    def calculation_transcripts_annotation(
 9920        self, info_json: str = None, info_format: str = None
 9921    ) -> None:
 9922        """
 9923        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9924        field to it if transcripts are available.
 9925
 9926        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9927        is a string parameter that represents the information field to be used in the transcripts JSON.
 9928        It is used to specify the JSON format for the transcripts information. If no value is provided
 9929        when calling the method, it defaults to "
 9930        :type info_json: str
 9931        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9932        method is a string parameter that specifies the format of the information field to be used in
 9933        the transcripts JSON. It is used to define the format of the information field
 9934        :type info_format: str
 9935        """
 9936
 9937        # Create transcripts table
 9938        transcripts_table = self.create_transcript_view()
 9939
 9940        # Add info field
 9941        if transcripts_table:
 9942            self.transcript_view_to_variants(
 9943                transcripts_table=transcripts_table,
 9944                transcripts_info_field_json=info_json,
 9945                transcripts_info_field_format=info_format,
 9946            )
 9947        else:
 9948            log.info("No Transcripts to process. Check param.json file configuration")
 9949
 9950    def calculation_transcripts_prioritization(self) -> None:
 9951        """
 9952        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9953        prioritizes transcripts based on certain criteria.
 9954        """
 9955
 9956        # Create transcripts table
 9957        transcripts_table = self.create_transcript_view()
 9958
 9959        # Add info field
 9960        if transcripts_table:
 9961            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9962        else:
 9963            log.info("No Transcripts to process. Check param.json file configuration")
 9964
 9965    def calculation_transcripts_export(self) -> None:
 9966        """ """
 9967
 9968        # Create transcripts table
 9969        transcripts_table = self.create_transcript_view()
 9970
 9971        # Add info field
 9972        if transcripts_table:
 9973            self.transcripts_export(transcripts_table=transcripts_table)
 9974        else:
 9975            log.info("No Transcripts to process. Check param.json file configuration")
 9976
 9977    ###############
 9978    # Transcripts #
 9979    ###############
 9980
 9981    def transcripts_export(
 9982        self, transcripts_table: str = None, param: dict = {}
 9983    ) -> bool:
 9984        """ """
 9985
 9986        log.debug("Start transcripts export...")
 9987
 9988        # Param
 9989        if not param:
 9990            param = self.get_param()
 9991
 9992        # Param export
 9993        param_transcript_export = param.get("transcripts", {}).get("export", {})
 9994
 9995        # Output file
 9996        transcripts_export_output = param_transcript_export.get("output", None)
 9997
 9998        if not param_transcript_export or not transcripts_export_output:
 9999            log.warning(f"No transcriipts export parameters defined!")
10000            return False
10001
10002        # List of transcripts annotations
10003        query_describe = f"""
10004            SELECT column_name
10005            FROM (
10006                    DESCRIBE SELECT * FROM {transcripts_table}
10007                )
10008            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10009        """
10010        transcripts_annotations_list = list(
10011            self.get_query_to_df(query=query_describe)["column_name"]
10012        )
10013
10014        # Create transcripts table for export
10015        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10016            random.choices(string.ascii_uppercase + string.digits, k=10)
10017        )
10018        query_create_transcripts_table_export = f"""
10019            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10020        """
10021        self.execute_query(query=query_create_transcripts_table_export)
10022
10023        # Output file format
10024        transcripts_export_output_format = get_file_format(
10025            filename=transcripts_export_output
10026        )
10027
10028        # Format VCF - construct INFO
10029        if transcripts_export_output_format in ["vcf"]:
10030
10031            # Construct query update INFO and header
10032            query_update_info = []
10033            for field in transcripts_annotations_list:
10034
10035                # If field not in header
10036                if field not in self.get_header_infos_list():
10037
10038                    # Add PZ Transcript in header
10039                    self.get_header().infos[field] = vcf.parser._Info(
10040                        field,
10041                        ".",
10042                        "String",
10043                        f"Annotation '{field}' from transcript view",
10044                        "unknown",
10045                        "unknown",
10046                        0,
10047                    )
10048
10049                # Add field as INFO/tag
10050                query_update_info.append(
10051                    f"""
10052                        CASE
10053                            WHEN "{field}" IS NOT NULL
10054                            THEN concat('{field}=', "{field}", ';')    
10055                            ELSE ''     
10056                        END
10057                        """
10058                )
10059
10060            # Query param
10061            query_update_info_value = (
10062                f""" concat('',  {", ".join(query_update_info)}) """
10063            )
10064            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10065
10066        else:
10067
10068            # Query param
10069            query_update_info_value = f""" NULL """
10070            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10071
10072        # Update query INFO column
10073        query_update = f"""
10074            UPDATE {transcripts_table_export}
10075            SET INFO = {query_update_info_value}
10076
10077        """
10078        self.execute_query(query=query_update)
10079
10080        # Export
10081        self.export_output(
10082            output_file=transcripts_export_output,
10083            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10084        )
10085
10086        # Drop transcripts export table
10087        query_drop_transcripts_table_export = f"""
10088            DROP TABLE {transcripts_table_export}
10089        """
10090        self.execute_query(query=query_drop_transcripts_table_export)
10091
10092    def transcripts_prioritization(
10093        self, transcripts_table: str = None, param: dict = {}
10094    ) -> bool:
10095        """
10096        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10097        and updates the variants table with the prioritized information.
10098
10099        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10100        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10101        This parameter is used to identify the table where the transcripts data is stored for the
10102        prioritization process
10103        :type transcripts_table: str
10104        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10105        that contains various configuration settings for the prioritization process of transcripts. It
10106        is used to customize the behavior of the prioritization algorithm and includes settings such as
10107        the prefix for prioritization fields, default profiles, and other
10108        :type param: dict
10109        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10110        transcripts prioritization process is successfully completed, and `False` if there are any
10111        issues or if no profile is defined for transcripts prioritization.
10112        """
10113
10114        log.debug("Start transcripts prioritization...")
10115
10116        # Param
10117        if not param:
10118            param = self.get_param()
10119
10120        # Variants table
10121        table_variants = self.get_table_variants()
10122
10123        # Transcripts table
10124        if transcripts_table is None:
10125            transcripts_table = self.create_transcript_view(
10126                transcripts_table="transcripts", param=param
10127            )
10128        if transcripts_table is None:
10129            msg_err = "No Transcripts table availalble"
10130            log.error(msg_err)
10131            raise ValueError(msg_err)
10132        log.debug(f"transcripts_table={transcripts_table}")
10133
10134        # Get transcripts columns
10135        columns_as_list_query = f"""
10136            DESCRIBE {transcripts_table}
10137        """
10138        columns_as_list = list(
10139            self.get_query_to_df(columns_as_list_query)["column_name"]
10140        )
10141
10142        # Create INFO if not exists
10143        if "INFO" not in columns_as_list:
10144            query_add_info = f"""
10145                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10146            """
10147            self.execute_query(query_add_info)
10148
10149        # Prioritization param and Force only PZ Score and Flag
10150        pz_param = param.get("transcripts", {}).get("prioritization", {})
10151
10152        # PZ profile by default
10153        pz_profile_default = (
10154            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10155        )
10156
10157        # Exit if no profile
10158        if pz_profile_default is None:
10159            log.warning("No profile defined for transcripts prioritization")
10160            return False
10161
10162        # PZ fields
10163        pz_param_pzfields = {}
10164
10165        # PZ field transcripts
10166        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10167
10168        # Add PZ Transcript in header
10169        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10170            pz_fields_transcripts,
10171            ".",
10172            "String",
10173            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10174            "unknown",
10175            "unknown",
10176            code_type_map["String"],
10177        )
10178
10179        # Mandatory fields
10180        pz_mandatory_fields_list = [
10181            "Score",
10182            "Flag",
10183            "Tags",
10184            "Comment",
10185            "Infos",
10186            "Class",
10187        ]
10188        pz_mandatory_fields = []
10189        for pz_mandatory_field in pz_mandatory_fields_list:
10190            pz_mandatory_fields.append(
10191                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10192            )
10193
10194        # PZ fields in param
10195        for pz_field in pz_param.get("pzfields", []):
10196            if pz_field in pz_mandatory_fields_list:
10197                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10198                    pz_param.get("pzprefix", "PTZ") + pz_field
10199                )
10200            else:
10201                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10202                pz_param_pzfields[pz_field] = pz_field_new
10203
10204                # Add PZ Transcript in header
10205                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10206                    pz_field_new,
10207                    ".",
10208                    "String",
10209                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10210                    "unknown",
10211                    "unknown",
10212                    code_type_map["String"],
10213                )
10214
10215        # PZ fields param
10216        pz_param["pzfields"] = pz_mandatory_fields
10217
10218        # Prioritization
10219        prioritization_result = self.prioritization(
10220            table=transcripts_table,
10221            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10222        )
10223        if not prioritization_result:
10224            log.warning("Transcripts prioritization not processed")
10225            return False
10226
10227        # PZ fields sql query
10228        query_update_select_list = []
10229        query_update_concat_list = []
10230        query_update_order_list = []
10231        for pz_param_pzfield in set(
10232            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10233        ):
10234            query_update_select_list.append(f" {pz_param_pzfield}, ")
10235
10236        for pz_param_pzfield in pz_param_pzfields:
10237            query_update_concat_list.append(
10238                f"""
10239                    , CASE 
10240                        WHEN {pz_param_pzfield} IS NOT NULL
10241                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10242                        ELSE ''
10243                    END
10244                """
10245            )
10246
10247        # Order by
10248        pz_orders = (
10249            param.get("transcripts", {})
10250            .get("prioritization", {})
10251            .get("prioritization_transcripts_order", {})
10252        )
10253        if not pz_orders:
10254            pz_orders = {
10255                pz_param.get("pzprefix", "PTZ") + "Flag": "ASC",
10256                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10257            }
10258        for pz_order in pz_orders:
10259            query_update_order_list.append(
10260                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10261            )
10262
10263        # Fields to explode
10264        fields_to_explode = (
10265            list(pz_param_pzfields.keys())
10266            + pz_mandatory_fields
10267            + list(pz_orders.keys())
10268        )
10269        # Remove transcript column as a specific transcript column
10270        if "transcript" in fields_to_explode:
10271            fields_to_explode.remove("transcript")
10272
10273        # Fields intranscripts table
10274        query_transcripts_table = f"""
10275            DESCRIBE SELECT * FROM {transcripts_table}
10276        """
10277        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10278
10279        # Check fields to explode
10280        for field_to_explode in fields_to_explode:
10281            if field_to_explode not in self.get_header_infos_list() + list(
10282                query_transcripts_table.column_name
10283            ):
10284                msg_err = f"INFO/{field_to_explode} NOT IN header"
10285                log.error(msg_err)
10286                raise ValueError(msg_err)
10287
10288        # Explode fields to explode
10289        self.explode_infos(
10290            table=transcripts_table,
10291            fields=fields_to_explode,
10292        )
10293
10294        # Transcript preference file
10295        transcripts_preference_file = (
10296            param.get("transcripts", {})
10297            .get("prioritization", {})
10298            .get("prioritization_transcripts", {})
10299        )
10300        transcripts_preference_file = full_path(transcripts_preference_file)
10301
10302        # Transcript preference forced
10303        transcript_preference_force = (
10304            param.get("transcripts", {})
10305            .get("prioritization", {})
10306            .get("prioritization_transcripts_force", False)
10307        )
10308        # Transcript version forced
10309        transcript_version_force = (
10310            param.get("transcripts", {})
10311            .get("prioritization", {})
10312            .get("prioritization_transcripts_version_force", False)
10313        )
10314
10315        # Transcripts Ranking
10316        if transcripts_preference_file:
10317
10318            # Transcripts file to dataframe
10319            if os.path.exists(transcripts_preference_file):
10320                transcripts_preference_dataframe = transcripts_file_to_df(
10321                    transcripts_preference_file
10322                )
10323            else:
10324                log.error(
10325                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10326                )
10327                raise ValueError(
10328                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10329                )
10330
10331            # Order by depending to transcript preference forcing
10332            if transcript_preference_force:
10333                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10334            else:
10335                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10336
10337            # Transcript columns joined depend on version consideration
10338            if transcript_version_force:
10339                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10340            else:
10341                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10342
10343            # Query ranking for update
10344            query_update_ranking = f"""
10345                SELECT
10346                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10347                    ROW_NUMBER() OVER (
10348                        PARTITION BY "#CHROM", POS, REF, ALT
10349                        ORDER BY {order_by}
10350                    ) AS rn
10351                FROM {transcripts_table}
10352                LEFT JOIN 
10353                    (
10354                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10355                        FROM transcripts_preference_dataframe
10356                    ) AS transcripts_preference
10357                ON {transcripts_version_join}
10358            """
10359
10360        else:
10361
10362            # Query ranking for update
10363            query_update_ranking = f"""
10364                SELECT
10365                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10366                    ROW_NUMBER() OVER (
10367                        PARTITION BY "#CHROM", POS, REF, ALT
10368                        ORDER BY {" , ".join(query_update_order_list)}
10369                    ) AS rn
10370                FROM {transcripts_table}
10371            """
10372
10373        # Export Transcripts prioritization infos to variants table
10374        query_update = f"""
10375            WITH RankedTranscripts AS (
10376                {query_update_ranking}
10377            )
10378            UPDATE {table_variants}
10379                SET
10380                INFO = CONCAT(CASE
10381                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10382                            THEN ''
10383                            ELSE concat("INFO", ';')
10384                        END,
10385                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10386                        )
10387            FROM
10388                RankedTranscripts
10389            WHERE
10390                rn = 1
10391                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10392                AND variants."POS" = RankedTranscripts."POS"
10393                AND variants."REF" = RankedTranscripts."REF"
10394                AND variants."ALT" = RankedTranscripts."ALT"     
10395        """
10396
10397        # log.debug(f"query_update={query_update}")
10398        self.execute_query(query=query_update)
10399
10400        # Return
10401        return True
10402
10403    def create_transcript_view_from_columns_map(
10404        self,
10405        transcripts_table: str = "transcripts",
10406        columns_maps: dict = {},
10407        added_columns: list = [],
10408        temporary_tables: list = None,
10409        annotation_fields: list = None,
10410        column_rename: dict = {},
10411        column_clean: bool = False,
10412        column_case: str = None,
10413    ) -> tuple[list, list, list]:
10414        """
10415        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10416        specified columns mapping for transcripts data.
10417
10418        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10419        of the table where the transcripts data is stored or will be stored in the database. This table
10420        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10421        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10422        :type transcripts_table: str (optional)
10423        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10424        about how to map columns from a transcripts table to create a view. Each entry in the
10425        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10426        typically includes details such as the main transcript column and additional information columns
10427        :type columns_maps: dict
10428        :param added_columns: The `added_columns` parameter in the
10429        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10430        that will be added to the view being created based on the columns map provided. These columns
10431        are generated by exploding the transcript information columns along with the main transcript
10432        column
10433        :type added_columns: list
10434        :param temporary_tables: The `temporary_tables` parameter in the
10435        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10436        tables created during the process of creating a transcript view from a columns map. These
10437        temporary tables are used to store intermediate results or transformations before the final view
10438        is generated
10439        :type temporary_tables: list
10440        :param annotation_fields: The `annotation_fields` parameter in the
10441        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10442        used for annotation in the query view creation process. These fields are extracted from the
10443        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10444        :type annotation_fields: list
10445        :param column_rename: The `column_rename` parameter in the
10446        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10447        custom renaming for columns during the creation of the temporary table view. This parameter
10448        provides a mapping of original column names to the desired renamed column names. By using this
10449        parameter,
10450        :type column_rename: dict
10451        :param column_clean: The `column_clean` parameter in the
10452        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10453        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10454        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10455        False
10456        :type column_clean: bool (optional)
10457        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10458        function is used to specify the case transformation to be applied to the columns during the view
10459        creation process. It allows you to control whether the column values should be converted to
10460        lowercase, uppercase, or remain unchanged
10461        :type column_case: str
10462        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10463        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10464        """
10465
10466        log.debug("Start transcrpts view creation from columns map...")
10467
10468        # "from_columns_map": [
10469        #     {
10470        #         "transcripts_column": "Ensembl_transcriptid",
10471        #         "transcripts_infos_columns": [
10472        #             "genename",
10473        #             "Ensembl_geneid",
10474        #             "LIST_S2_score",
10475        #             "LIST_S2_pred",
10476        #         ],
10477        #     },
10478        #     {
10479        #         "transcripts_column": "Ensembl_transcriptid",
10480        #         "transcripts_infos_columns": [
10481        #             "genename",
10482        #             "VARITY_R_score",
10483        #             "Aloft_pred",
10484        #         ],
10485        #     },
10486        # ],
10487
10488        # Init
10489        if temporary_tables is None:
10490            temporary_tables = []
10491        if annotation_fields is None:
10492            annotation_fields = []
10493
10494        # Variants table
10495        table_variants = self.get_table_variants()
10496
10497        for columns_map in columns_maps:
10498
10499            # Transcript column
10500            transcripts_column = columns_map.get("transcripts_column", None)
10501
10502            # Transcripts infos columns
10503            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10504
10505            # Transcripts infos columns rename
10506            column_rename = columns_map.get("column_rename", column_rename)
10507
10508            # Transcripts infos columns clean
10509            column_clean = columns_map.get("column_clean", column_clean)
10510
10511            # Transcripts infos columns case
10512            column_case = columns_map.get("column_case", column_case)
10513
10514            if transcripts_column is not None:
10515
10516                # Explode
10517                added_columns += self.explode_infos(
10518                    fields=[transcripts_column] + transcripts_infos_columns
10519                )
10520
10521                # View clauses
10522                clause_select_variants = []
10523                clause_select_tanscripts = []
10524                for field in [transcripts_column] + transcripts_infos_columns:
10525
10526                    # AS field
10527                    as_field = field
10528
10529                    # Rename
10530                    if column_rename:
10531                        as_field = column_rename.get(as_field, as_field)
10532
10533                    # Clean
10534                    if column_clean:
10535                        as_field = clean_annotation_field(as_field)
10536
10537                    # Case
10538                    if column_case:
10539                        if column_case.lower() in ["lower"]:
10540                            as_field = as_field.lower()
10541                        elif column_case.lower() in ["upper"]:
10542                            as_field = as_field.upper()
10543
10544                    # Clause select Variants
10545                    clause_select_variants.append(
10546                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10547                    )
10548
10549                    if field in [transcripts_column]:
10550                        clause_select_tanscripts.append(
10551                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10552                        )
10553                    else:
10554                        clause_select_tanscripts.append(
10555                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10556                        )
10557                        annotation_fields.append(as_field)
10558
10559                # Querey View
10560                query = f""" 
10561                    SELECT
10562                        "#CHROM", POS, REF, ALT, INFO,
10563                        "{transcripts_column}" AS 'transcript',
10564                        {", ".join(clause_select_tanscripts)}
10565                    FROM (
10566                        SELECT 
10567                            "#CHROM", POS, REF, ALT, INFO,
10568                            {", ".join(clause_select_variants)}
10569                        FROM {table_variants}
10570                        )
10571                    WHERE "{transcripts_column}" IS NOT NULL
10572                """
10573
10574                # Create temporary table
10575                temporary_table = transcripts_table + "".join(
10576                    random.choices(string.ascii_uppercase + string.digits, k=10)
10577                )
10578
10579                # Temporary_tables
10580                temporary_tables.append(temporary_table)
10581                query_view = f"""
10582                    CREATE TEMPORARY TABLE {temporary_table}
10583                    AS ({query})
10584                """
10585                self.execute_query(query=query_view)
10586
10587        return added_columns, temporary_tables, annotation_fields
10588
10589    def create_transcript_view_from_column_format(
10590        self,
10591        transcripts_table: str = "transcripts",
10592        column_formats: dict = {},
10593        temporary_tables: list = None,
10594        annotation_fields: list = None,
10595        column_rename: dict = {},
10596        column_clean: bool = False,
10597        column_case: str = None,
10598    ) -> tuple[list, list, list]:
10599        """
10600        The `create_transcript_view_from_column_format` function generates a transcript view based on
10601        specified column formats, adds additional columns and annotation fields, and returns the list of
10602        temporary tables and annotation fields.
10603
10604        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10605        of the table containing the transcripts data. This table will be used as the base table for
10606        creating the transcript view. The default value for this parameter is "transcripts", but you can
10607        provide a different table name if needed, defaults to transcripts
10608        :type transcripts_table: str (optional)
10609        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10610        about the columns to be used for creating the transcript view. Each entry in the dictionary
10611        specifies the mapping between a transcripts column and a transcripts infos column. This
10612        parameter allows you to define how the columns from the transcripts table should be transformed
10613        or mapped
10614        :type column_formats: dict
10615        :param temporary_tables: The `temporary_tables` parameter in the
10616        `create_transcript_view_from_column_format` function is a list that stores the names of
10617        temporary views created during the process of creating a transcript view from a column format.
10618        These temporary views are used to manipulate and extract data before generating the final
10619        transcript view
10620        :type temporary_tables: list
10621        :param annotation_fields: The `annotation_fields` parameter in the
10622        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10623        that are extracted from the temporary views created during the process. These annotation fields
10624        are obtained by querying the temporary views and extracting the column names excluding specific
10625        columns like `#CH
10626        :type annotation_fields: list
10627        :param column_rename: The `column_rename` parameter in the
10628        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10629        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10630        column names to new column names in this dictionary, you can rename specific columns during the
10631        process
10632        :type column_rename: dict
10633        :param column_clean: The `column_clean` parameter in the
10634        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10635        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10636        will be cleaned during the creation of the transcript view based on the specified column format,
10637        defaults to False
10638        :type column_clean: bool (optional)
10639        :param column_case: The `column_case` parameter in the
10640        `create_transcript_view_from_column_format` function is used to specify the case transformation
10641        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10642        to convert the column names to uppercase or lowercase, respectively
10643        :type column_case: str
10644        :return: The `create_transcript_view_from_column_format` function returns two lists:
10645        `temporary_tables` and `annotation_fields`.
10646        """
10647
10648        log.debug("Start transcrpts view creation from column format...")
10649
10650        #  "from_column_format": [
10651        #     {
10652        #         "transcripts_column": "ANN",
10653        #         "transcripts_infos_column": "Feature_ID",
10654        #     }
10655        # ],
10656
10657        # Init
10658        if temporary_tables is None:
10659            temporary_tables = []
10660        if annotation_fields is None:
10661            annotation_fields = []
10662
10663        for column_format in column_formats:
10664
10665            # annotation field and transcript annotation field
10666            annotation_field = column_format.get("transcripts_column", "ANN")
10667            transcript_annotation = column_format.get(
10668                "transcripts_infos_column", "Feature_ID"
10669            )
10670
10671            # Transcripts infos columns rename
10672            column_rename = column_format.get("column_rename", column_rename)
10673
10674            # Transcripts infos columns clean
10675            column_clean = column_format.get("column_clean", column_clean)
10676
10677            # Transcripts infos columns case
10678            column_case = column_format.get("column_case", column_case)
10679
10680            # Temporary View name
10681            temporary_view_name = transcripts_table + "".join(
10682                random.choices(string.ascii_uppercase + string.digits, k=10)
10683            )
10684
10685            # Create temporary view name
10686            temporary_view_name = self.annotation_format_to_table(
10687                uniquify=True,
10688                annotation_field=annotation_field,
10689                view_name=temporary_view_name,
10690                annotation_id=transcript_annotation,
10691                column_rename=column_rename,
10692                column_clean=column_clean,
10693                column_case=column_case,
10694            )
10695
10696            # Annotation fields
10697            if temporary_view_name:
10698                query_annotation_fields = f"""
10699                    SELECT *
10700                    FROM (
10701                        DESCRIBE SELECT *
10702                        FROM {temporary_view_name}
10703                        )
10704                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10705                """
10706                df_annotation_fields = self.get_query_to_df(
10707                    query=query_annotation_fields
10708                )
10709
10710                # Add temporary view and annotation fields
10711                temporary_tables.append(temporary_view_name)
10712                annotation_fields += list(set(df_annotation_fields["column_name"]))
10713
10714        return temporary_tables, annotation_fields
10715
10716    def create_transcript_view(
10717        self,
10718        transcripts_table: str = None,
10719        transcripts_table_drop: bool = True,
10720        param: dict = {},
10721    ) -> str:
10722        """
10723        The `create_transcript_view` function generates a transcript view by processing data from a
10724        specified table based on provided parameters and structural information.
10725
10726        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10727        is used to specify the name of the table that will store the final transcript view data. If a table
10728        name is not provided, the function will create a new table to store the transcript view data, and by
10729        default,, defaults to transcripts
10730        :type transcripts_table: str (optional)
10731        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10732        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10733        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10734        the function will drop the existing transcripts table if it exists, defaults to True
10735        :type transcripts_table_drop: bool (optional)
10736        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10737        contains information needed to create a transcript view. It includes details such as the structure
10738        of the transcripts, columns mapping, column formats, and other necessary information for generating
10739        the view. This parameter allows for flexibility and customization
10740        :type param: dict
10741        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10742        created or modified during the execution of the function.
10743        """
10744
10745        log.debug("Start transcripts view creation...")
10746
10747        # Default
10748        transcripts_table_default = "transcripts"
10749
10750        # Param
10751        if not param:
10752            param = self.get_param()
10753
10754        # Struct
10755        struct = param.get("transcripts", {}).get("struct", None)
10756
10757        # Transcript veresion
10758        transcript_id_remove_version = param.get("transcripts", {}).get(
10759            "transcript_id_remove_version", False
10760        )
10761
10762        # Transcripts mapping
10763        transcript_id_mapping_file = param.get("transcripts", {}).get(
10764            "transcript_id_mapping_file", None
10765        )
10766
10767        # Transcripts mapping
10768        transcript_id_mapping_force = param.get("transcripts", {}).get(
10769            "transcript_id_mapping_force", None
10770        )
10771
10772        if struct:
10773
10774            # Transcripts table
10775            if transcripts_table is None:
10776                transcripts_table = param.get("transcripts", {}).get(
10777                    "table", transcripts_table_default
10778                )
10779
10780            # added_columns
10781            added_columns = []
10782
10783            # Temporary tables
10784            temporary_tables = []
10785
10786            # Annotation fields
10787            annotation_fields = []
10788
10789            # from columns map
10790            columns_maps = struct.get("from_columns_map", [])
10791            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10792                self.create_transcript_view_from_columns_map(
10793                    transcripts_table=transcripts_table,
10794                    columns_maps=columns_maps,
10795                    added_columns=added_columns,
10796                    temporary_tables=temporary_tables,
10797                    annotation_fields=annotation_fields,
10798                )
10799            )
10800            added_columns += added_columns_tmp
10801            temporary_tables += temporary_tables_tmp
10802            annotation_fields += annotation_fields_tmp
10803
10804            # from column format
10805            column_formats = struct.get("from_column_format", [])
10806            temporary_tables_tmp, annotation_fields_tmp = (
10807                self.create_transcript_view_from_column_format(
10808                    transcripts_table=transcripts_table,
10809                    column_formats=column_formats,
10810                    temporary_tables=temporary_tables,
10811                    annotation_fields=annotation_fields,
10812                )
10813            )
10814            temporary_tables += temporary_tables_tmp
10815            annotation_fields += annotation_fields_tmp
10816
10817            # Remove some specific fields/column
10818            annotation_fields = list(set(annotation_fields))
10819            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
10820                if field in annotation_fields:
10821                    annotation_fields.remove(field)
10822
10823            # Merge temporary tables query
10824            query_merge = ""
10825            for temporary_table in list(set(temporary_tables)):
10826
10827                # First temporary table
10828                if not query_merge:
10829                    query_merge = f"""
10830                        SELECT * FROM {temporary_table}
10831                    """
10832                # other temporary table (using UNION)
10833                else:
10834                    query_merge += f"""
10835                        UNION BY NAME SELECT * FROM {temporary_table}
10836                    """
10837
10838            # transcript table tmp
10839            transcript_table_tmp = "transcripts_tmp"
10840            transcript_table_tmp2 = "transcripts_tmp2"
10841            transcript_table_tmp3 = "transcripts_tmp3"
10842
10843            # Merge on transcript
10844            query_merge_on_transcripts_annotation_fields = []
10845
10846            # Add transcript list
10847            query_merge_on_transcripts_annotation_fields.append(
10848                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
10849            )
10850
10851            # Aggregate all annotations fields
10852            for annotation_field in set(annotation_fields):
10853                query_merge_on_transcripts_annotation_fields.append(
10854                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
10855                )
10856
10857            # Transcripts mapping
10858            if transcript_id_mapping_file:
10859
10860                # Transcript dataframe
10861                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
10862                transcript_id_mapping_dataframe = transcripts_file_to_df(
10863                    transcript_id_mapping_file, column_names=["transcript", "alias"]
10864                )
10865
10866                # Transcript version remove
10867                if transcript_id_remove_version:
10868                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
10869                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
10870                    query_left_join = f"""
10871                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10872                    """
10873                else:
10874                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
10875                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
10876                    query_left_join = f"""
10877                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
10878                    """
10879
10880                # Transcript column for group by merge
10881                query_transcript_merge_group_by = """
10882                        CASE
10883                            WHEN transcript_mapped NOT IN ('')
10884                            THEN split_part(transcript_mapped, '.', 1)
10885                            ELSE split_part(transcript_original, '.', 1)
10886                        END
10887                    """
10888
10889                # Merge query
10890                transcripts_tmp2_query = f"""
10891                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
10892                    FROM ({query_merge}) AS {transcript_table_tmp}
10893                    {query_left_join}
10894                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
10895                """
10896
10897                # Retrive columns after mege
10898                transcripts_tmp2_describe_query = f"""
10899                    DESCRIBE {transcripts_tmp2_query}
10900                """
10901                transcripts_tmp2_describe_list = list(
10902                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
10903                        "column_name"
10904                    ]
10905                )
10906
10907                # Create list of columns for select clause
10908                transcripts_tmp2_describe_select_clause = []
10909                for field in transcripts_tmp2_describe_list:
10910                    if field not in [
10911                        "#CHROM",
10912                        "POS",
10913                        "REF",
10914                        "ALT",
10915                        "INFO",
10916                        "transcript_mapped",
10917                    ]:
10918                        as_field = field
10919                        if field in ["transcript_original"]:
10920                            as_field = "transcripts_mapped"
10921                        transcripts_tmp2_describe_select_clause.append(
10922                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
10923                        )
10924
10925                # Merge with mapping
10926                query_merge_on_transcripts = f"""
10927                    SELECT
10928                        "#CHROM", POS, REF, ALT, INFO,
10929                        CASE
10930                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
10931                            THEN ANY_VALUE(transcript_mapped)
10932                            ELSE ANY_VALUE(transcript_original)
10933                        END AS transcript,
10934                        {", ".join(transcripts_tmp2_describe_select_clause)}
10935                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
10936                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
10937                        {query_transcript_merge_group_by}
10938                """
10939
10940                # Add transcript filter from mapping file
10941                if transcript_id_mapping_force:
10942                    query_merge_on_transcripts = f"""
10943                        SELECT *
10944                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
10945                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
10946                    """
10947
10948            # No transcript mapping
10949            else:
10950
10951                # Remove transcript version
10952                if transcript_id_remove_version:
10953                    query_transcript_column = f"""
10954                        split_part({transcript_table_tmp}.transcript, '.', 1)
10955                    """
10956                else:
10957                    query_transcript_column = """
10958                        transcript
10959                    """
10960
10961                # Query sections
10962                query_transcript_column_select = (
10963                    f"{query_transcript_column} AS transcript"
10964                )
10965                query_transcript_column_group_by = query_transcript_column
10966
10967                # Query for transcripts view
10968                query_merge_on_transcripts = f"""
10969                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
10970                    FROM ({query_merge}) AS {transcript_table_tmp}
10971                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
10972                """
10973
10974            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")
10975
10976            # Drop transcript view is necessary
10977            if transcripts_table_drop:
10978                query_drop = f"""
10979                    DROP TABLE IF EXISTS {transcripts_table};
10980                """
10981                self.execute_query(query=query_drop)
10982
10983            # Merge and create transcript view
10984            query_create_view = f"""
10985                CREATE TABLE IF NOT EXISTS {transcripts_table}
10986                AS {query_merge_on_transcripts}
10987            """
10988            self.execute_query(query=query_create_view)
10989
10990            # Remove added columns
10991            for added_column in added_columns:
10992                self.drop_column(column=added_column)
10993
10994        else:
10995
10996            transcripts_table = None
10997
10998        return transcripts_table
10999
11000    def annotation_format_to_table(
11001        self,
11002        uniquify: bool = True,
11003        annotation_field: str = "ANN",
11004        annotation_id: str = "Feature_ID",
11005        view_name: str = "transcripts",
11006        column_rename: dict = {},
11007        column_clean: bool = False,
11008        column_case: str = None,
11009    ) -> str:
11010        """
11011        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11012        structured table format, ensuring unique values and creating a temporary table for further
11013        processing or analysis.
11014
11015        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11016        unique values in the output or not. If set to `True`, the function will make sure that the
11017        output values are unique, defaults to True
11018        :type uniquify: bool (optional)
11019        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11020        that contains the annotation information for each variant. This field is used to extract the
11021        annotation details for further processing in the function. By default, it is set to "ANN",
11022        defaults to ANN
11023        :type annotation_field: str (optional)
11024        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11025        is used to specify the identifier for the annotation feature. This identifier will be used as a
11026        column name in the resulting table or view that is created based on the annotation data. It
11027        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11028        :type annotation_id: str (optional)
11029        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11030        to specify the name of the temporary table that will be created to store the transformed
11031        annotation data. This table will hold the extracted information from the annotation field in a
11032        structured format for further processing or analysis. By default,, defaults to transcripts
11033        :type view_name: str (optional)
11034        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11035        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11036        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11037        created based on the annotation data. This feature enables
11038        :type column_rename: dict
11039        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11040        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11041        If set to `True`, the function will clean the annotation field before further processing. This
11042        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11043        to False
11044        :type column_clean: bool (optional)
11045        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11046        used to specify the case transformation to be applied to the column names extracted from the
11047        annotation data. It allows you to set the case of the column names to either lowercase or
11048        uppercase for consistency or other specific requirements during the conversion
11049        :type column_case: str
11050        :return: The function `annotation_format_to_table` is returning the name of the view created,
11051        which is stored in the variable `view_name`.
11052        """
11053
11054        # Annotation field
11055        annotation_format = "annotation_explode"
11056
11057        # Transcript annotation
11058        if column_rename:
11059            annotation_id = column_rename.get(annotation_id, annotation_id)
11060
11061        if column_clean:
11062            annotation_id = clean_annotation_field(annotation_id)
11063
11064        # Prefix
11065        prefix = self.get_explode_infos_prefix()
11066        if prefix:
11067            prefix = "INFO/"
11068
11069        # Annotation fields
11070        annotation_infos = prefix + annotation_field
11071        annotation_format_infos = prefix + annotation_format
11072
11073        # Variants table
11074        table_variants = self.get_table_variants()
11075
11076        # Header
11077        vcf_reader = self.get_header()
11078
11079        # Add columns
11080        added_columns = []
11081
11082        # Explode HGVS field in column
11083        added_columns += self.explode_infos(fields=[annotation_field])
11084
11085        if annotation_field in vcf_reader.infos:
11086
11087            # Extract ANN header
11088            ann_description = vcf_reader.infos[annotation_field].desc
11089            pattern = r"'(.+?)'"
11090            match = re.search(pattern, ann_description)
11091            if match:
11092                ann_header_match = match.group(1).split(" | ")
11093                ann_header = []
11094                ann_header_desc = {}
11095                for i in range(len(ann_header_match)):
11096                    ann_header_info = "".join(
11097                        char for char in ann_header_match[i] if char.isalnum()
11098                    )
11099                    ann_header.append(ann_header_info)
11100                    ann_header_desc[ann_header_info] = ann_header_match[i]
11101                if not ann_header_desc:
11102                    raise ValueError("Invalid header description format")
11103            else:
11104                raise ValueError("Invalid header description format")
11105
11106            # Create variant id
11107            variant_id_column = self.get_variant_id_column()
11108            added_columns += [variant_id_column]
11109
11110            # Create dataframe
11111            dataframe_annotation_format = self.get_query_to_df(
11112                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
11113            )
11114
11115            # Create annotation columns
11116            dataframe_annotation_format[
11117                annotation_format_infos
11118            ] = dataframe_annotation_format[annotation_infos].apply(
11119                lambda x: explode_annotation_format(
11120                    annotation=str(x),
11121                    uniquify=uniquify,
11122                    output_format="JSON",
11123                    prefix="",
11124                    header=list(ann_header_desc.values()),
11125                )
11126            )
11127
11128            # Find keys
11129            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
11130            df_keys = self.get_query_to_df(query=query_json)
11131
11132            # Check keys
11133            query_json_key = []
11134            for _, row in df_keys.iterrows():
11135
11136                # Key
11137                key = row.iloc[0]
11138                key_clean = key
11139
11140                # key rename
11141                if column_rename:
11142                    key_clean = column_rename.get(key_clean, key_clean)
11143
11144                # key clean
11145                if column_clean:
11146                    key_clean = clean_annotation_field(key_clean)
11147
11148                # Key case
11149                if column_case:
11150                    if column_case.lower() in ["lower"]:
11151                        key_clean = key_clean.lower()
11152                    elif column_case.lower() in ["upper"]:
11153                        key_clean = key_clean.upper()
11154
11155                # Type
11156                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
11157
11158                # Get DataFrame from query
11159                df_json_type = self.get_query_to_df(query=query_json_type)
11160
11161                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
11162                with pd.option_context("future.no_silent_downcasting", True):
11163                    df_json_type.fillna(value="", inplace=True)
11164                    replace_dict = {None: np.nan, "": np.nan}
11165                    df_json_type.replace(replace_dict, inplace=True)
11166                    df_json_type.dropna(inplace=True)
11167
11168                # Detect column type
11169                column_type = detect_column_type(df_json_type[key_clean])
11170
11171                # Append
11172                query_json_key.append(
11173                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
11174                )
11175
11176            # Create view
11177            query_view = f"""
11178                CREATE TEMPORARY TABLE {view_name}
11179                AS (
11180                    SELECT *, {annotation_id} AS 'transcript'
11181                    FROM (
11182                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
11183                        FROM dataframe_annotation_format
11184                        )
11185                    );
11186            """
11187            self.execute_query(query=query_view)
11188
11189        else:
11190
11191            # Return None
11192            view_name = None
11193
11194        # Remove added columns
11195        for added_column in added_columns:
11196            self.drop_column(column=added_column)
11197
11198        return view_name
11199
11200    def transcript_view_to_variants(
11201        self,
11202        transcripts_table: str = None,
11203        transcripts_column_id: str = None,
11204        transcripts_info_json: str = None,
11205        transcripts_info_field_json: str = None,
11206        transcripts_info_format: str = None,
11207        transcripts_info_field_format: str = None,
11208        param: dict = {},
11209    ) -> bool:
11210        """
11211        The `transcript_view_to_variants` function updates a variants table with information from
11212        transcripts in JSON format.
11213
11214        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11215        table containing the transcripts data. If this parameter is not provided, the function will
11216        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11217        :type transcripts_table: str
11218        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11219        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11220        identifier is used to match transcripts with variants in the database
11221        :type transcripts_column_id: str
11222        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11223        of the column in the variants table where the transcripts information will be stored in JSON
11224        format. This parameter allows you to define the column in the variants table that will hold the
11225        JSON-formatted information about transcripts
11226        :type transcripts_info_json: str
11227        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11228        specify the field in the VCF header that will contain information about transcripts in JSON
11229        format. This field will be added to the VCF header as an INFO field with the specified name
11230        :type transcripts_info_field_json: str
11231        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11232        format of the information about transcripts that will be stored in the variants table. This
11233        format can be used to define how the transcript information will be structured or displayed
11234        within the variants table
11235        :type transcripts_info_format: str
11236        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11237        specify the field in the VCF header that will contain information about transcripts in a
11238        specific format. This field will be added to the VCF header as an INFO field with the specified
11239        name
11240        :type transcripts_info_field_format: str
11241        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11242        that contains various configuration settings related to transcripts. It is used to provide
11243        default values for certain parameters if they are not explicitly provided when calling the
11244        method. The `param` dictionary can be passed as an argument
11245        :type param: dict
11246        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11247        if the operation is successful and `False` if certain conditions are not met.
11248        """
11249
11250        msg_info_prefix = "Start transcripts view to variants annotations"
11251
11252        log.debug(f"{msg_info_prefix}...")
11253
11254        # Default
11255        transcripts_table_default = "transcripts"
11256        transcripts_column_id_default = "transcript"
11257        transcripts_info_json_default = None
11258        transcripts_info_format_default = None
11259        transcripts_info_field_json_default = None
11260        transcripts_info_field_format_default = None
11261
11262        # Param
11263        if not param:
11264            param = self.get_param()
11265
11266        # Transcripts table
11267        if transcripts_table is None:
11268            transcripts_table = param.get("transcripts", {}).get(
11269                "table", transcripts_table_default
11270            )
11271
11272        # Transcripts column ID
11273        if transcripts_column_id is None:
11274            transcripts_column_id = param.get("transcripts", {}).get(
11275                "column_id", transcripts_column_id_default
11276            )
11277
11278        # Transcripts info json
11279        if transcripts_info_json is None:
11280            transcripts_info_json = param.get("transcripts", {}).get(
11281                "transcripts_info_json", transcripts_info_json_default
11282            )
11283
11284        # Transcripts info field JSON
11285        if transcripts_info_field_json is None:
11286            transcripts_info_field_json = param.get("transcripts", {}).get(
11287                "transcripts_info_field_json", transcripts_info_field_json_default
11288            )
11289        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11290        #     transcripts_info_json = transcripts_info_field_json
11291
11292        # Transcripts info format
11293        if transcripts_info_format is None:
11294            transcripts_info_format = param.get("transcripts", {}).get(
11295                "transcripts_info_format", transcripts_info_format_default
11296            )
11297
11298        # Transcripts info field FORMAT
11299        if transcripts_info_field_format is None:
11300            transcripts_info_field_format = param.get("transcripts", {}).get(
11301                "transcripts_info_field_format", transcripts_info_field_format_default
11302            )
11303        # if (
11304        #     transcripts_info_field_format is not None
11305        #     and transcripts_info_format is None
11306        # ):
11307        #     transcripts_info_format = transcripts_info_field_format
11308
11309        # Variants table
11310        table_variants = self.get_table_variants()
11311
11312        # Check info columns param
11313        if (
11314            transcripts_info_json is None
11315            and transcripts_info_field_json is None
11316            and transcripts_info_format is None
11317            and transcripts_info_field_format is None
11318        ):
11319            return False
11320
11321        # Transcripts infos columns
11322        query_transcripts_infos_columns = f"""
11323            SELECT *
11324            FROM (
11325                DESCRIBE SELECT * FROM {transcripts_table}
11326                )
11327            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11328        """
11329        transcripts_infos_columns = list(
11330            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11331        )
11332
11333        # View results
11334        clause_select = []
11335        clause_to_json = []
11336        clause_to_format = []
11337        for field in transcripts_infos_columns:
11338            # Do not consider INFO field for export into fields
11339            if field not in ["INFO"]:
11340                clause_select.append(
11341                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11342                )
11343                clause_to_json.append(f""" '{field}': "{field}" """)
11344                clause_to_format.append(f""" "{field}" """)
11345
11346        # Update
11347        update_set_json = []
11348        update_set_format = []
11349
11350        # VCF header
11351        vcf_reader = self.get_header()
11352
11353        # Transcripts to info column in JSON
11354        if transcripts_info_json:
11355
11356            # Create column on variants table
11357            self.add_column(
11358                table_name=table_variants,
11359                column_name=transcripts_info_json,
11360                column_type="JSON",
11361                default_value=None,
11362                drop=False,
11363            )
11364
11365            # Add header
11366            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11367                transcripts_info_json,
11368                ".",
11369                "String",
11370                "Transcripts in JSON format",
11371                "unknwon",
11372                "unknwon",
11373                self.code_type_map["String"],
11374            )
11375
11376            # Add to update
11377            update_set_json.append(
11378                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11379            )
11380
11381        # Transcripts to info field in JSON
11382        if transcripts_info_field_json:
11383
11384            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11385
11386            # Add to update
11387            update_set_json.append(
11388                f""" 
11389                    INFO = concat(
11390                            CASE
11391                                WHEN INFO NOT IN ('', '.')
11392                                THEN INFO
11393                                ELSE ''
11394                            END,
11395                            CASE
11396                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11397                                THEN concat(
11398                                    ';{transcripts_info_field_json}=',
11399                                    t.{transcripts_info_json}
11400                                )
11401                                ELSE ''
11402                            END
11403                            )
11404                """
11405            )
11406
11407            # Add header
11408            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11409                transcripts_info_field_json,
11410                ".",
11411                "String",
11412                "Transcripts in JSON format",
11413                "unknwon",
11414                "unknwon",
11415                self.code_type_map["String"],
11416            )
11417
11418        if update_set_json:
11419
11420            # Update query
11421            query_update = f"""
11422                UPDATE {table_variants}
11423                    SET {", ".join(update_set_json)}
11424                FROM
11425                (
11426                    SELECT
11427                        "#CHROM", POS, REF, ALT,
11428                            concat(
11429                            '{{',
11430                            string_agg(
11431                                '"' || "{transcripts_column_id}" || '":' ||
11432                                to_json(json_output)
11433                            ),
11434                            '}}'
11435                            )::JSON AS {transcripts_info_json}
11436                    FROM
11437                        (
11438                        SELECT
11439                            "#CHROM", POS, REF, ALT,
11440                            "{transcripts_column_id}",
11441                            to_json(
11442                                {{{",".join(clause_to_json)}}}
11443                            )::JSON AS json_output
11444                        FROM
11445                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11446                        WHERE "{transcripts_column_id}" IS NOT NULL
11447                        )
11448                    GROUP BY "#CHROM", POS, REF, ALT
11449                ) AS t
11450                WHERE {table_variants}."#CHROM" = t."#CHROM"
11451                    AND {table_variants}."POS" = t."POS"
11452                    AND {table_variants}."REF" = t."REF"
11453                    AND {table_variants}."ALT" = t."ALT"
11454            """
11455
11456            self.execute_query(query=query_update)
11457
11458        # Transcripts to info column in FORMAT
11459        if transcripts_info_format:
11460
11461            # Create column on variants table
11462            self.add_column(
11463                table_name=table_variants,
11464                column_name=transcripts_info_format,
11465                column_type="VARCHAR",
11466                default_value=None,
11467                drop=False,
11468            )
11469
11470            # Add header
11471            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11472                transcripts_info_format,
11473                ".",
11474                "String",
11475                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11476                "unknwon",
11477                "unknwon",
11478                self.code_type_map["String"],
11479            )
11480
11481            # Add to update
11482            update_set_format.append(
11483                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11484            )
11485
11486        else:
11487
11488            # Set variable for internal queries
11489            transcripts_info_format = "transcripts_info_format"
11490
11491        # Transcripts to info field in JSON
11492        if transcripts_info_field_format:
11493
11494            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11495
11496            # Add to update
11497            update_set_format.append(
11498                f""" 
11499                    INFO = concat(
11500                            CASE
11501                                WHEN INFO NOT IN ('', '.')
11502                                THEN INFO
11503                                ELSE ''
11504                            END,
11505                            CASE
11506                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11507                                THEN concat(
11508                                    ';{transcripts_info_field_format}=',
11509                                    t.{transcripts_info_format}
11510                                )
11511                                ELSE ''
11512                            END
11513                            )
11514                """
11515            )
11516
11517            # Add header
11518            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11519                transcripts_info_field_format,
11520                ".",
11521                "String",
11522                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11523                "unknwon",
11524                "unknwon",
11525                self.code_type_map["String"],
11526            )
11527
11528        if update_set_format:
11529
11530            # Update query
11531            query_update = f"""
11532                UPDATE {table_variants}
11533                    SET {", ".join(update_set_format)}
11534                FROM
11535                (
11536                    SELECT
11537                        "#CHROM", POS, REF, ALT,
11538                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11539                    FROM 
11540                        (
11541                        SELECT
11542                            "#CHROM", POS, REF, ALT,
11543                            "{transcripts_column_id}",
11544                            concat(
11545                                "{transcripts_column_id}",
11546                                '|',
11547                                {", '|', ".join(clause_to_format)}
11548                            ) AS {transcripts_info_format}
11549                        FROM
11550                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11551                        )
11552                    GROUP BY "#CHROM", POS, REF, ALT
11553                ) AS t
11554                WHERE {table_variants}."#CHROM" = t."#CHROM"
11555                    AND {table_variants}."POS" = t."POS"
11556                    AND {table_variants}."REF" = t."REF"
11557                    AND {table_variants}."ALT" = t."ALT"
11558            """
11559
11560            self.execute_query(query=query_update)
11561
11562        return True
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
38    def __init__(
39        self,
40        conn=None,
41        input: str = None,
42        output: str = None,
43        config: dict = {},
44        param: dict = {},
45        load: bool = False,
46    ) -> None:
47        """
48        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
49        header
50
51        :param conn: the connection to the database
52        :param input: the input file
53        :param output: the output file
54        :param config: a dictionary containing the configuration of the model
55        :param param: a dictionary containing the parameters of the model
56        """
57
58        # Init variables
59        self.init_variables()
60
61        # Input
62        self.set_input(input)
63
64        # Config
65        self.set_config(config)
66
67        # Param
68        self.set_param(param)
69
70        # Output
71        self.set_output(output)
72
73        # connexion
74        self.set_connexion(conn)
75
76        # Header
77        self.set_header()
78
79        # Samples
80        self.set_samples()
81
82        # Load data
83        if load:
84            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connection and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
 86    def set_samples(self, samples: list = None) -> list:
 87        """
 88        The function `set_samples` sets the samples attribute of an object to a provided list or
 89        retrieves it from a parameter dictionary.
 90
 91        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
 92        input and sets the `samples` attribute of the class to the provided list. If no samples are
 93        provided, it tries to get the samples from the class's parameters using the `get_param` method
 94        :type samples: list
 95        :return: The `samples` list is being returned.
 96        """
 97
 98        if not samples:
 99            samples = self.get_param().get("samples", {}).get("list", None)
100
101        self.samples = samples
102
103        return samples

The function set_samples sets the samples attribute of an object to a provided list or retrieves it from a parameter dictionary.

Parameters
  • samples: The set_samples method is a method of a class that takes a list of samples as input and sets the samples attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the get_param method
Returns

The samples list is being returned.

def get_samples(self) -> list:
def get_samples(self) -> list:
    """
    Return the sample list stored on this object.

    :return: the value of the ``samples`` attribute
    """

    return self.samples

This function returns a list of samples.

Returns

The get_samples method is returning the samples attribute of the object.

def get_samples_check(self) -> bool:
def get_samples_check(self) -> bool:
    """
    Return whether sample checking is enabled.

    Reads the "check" key of the "samples" section of the parameter
    dictionary returned by `get_param()`.

    :return: the value of param["samples"]["check"]; True when the key
    (or the whole "samples" section) is absent. Note: the previous
    docstring incorrectly stated the default was False.
    """

    return self.get_param().get("samples", {}).get("check", True)

This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.

Returns

The method get_samples_check returns the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the get_param() method. If the key "check" is not found, it returns True, so sample checking is enabled by default.

def set_input(self, input: str = None) -> None:
def set_input(self, input: str = None) -> None:
    """
    Set the input file and derive its name, extension and format.

    Accepts either a path string or a file-like object carrying a
    ``name`` attribute. When an input is set, `input_name`,
    `input_extension` and `input_format` are derived from the file
    name; otherwise they are reset to None (mirroring `set_output`).

    :param input: input file path, or an object with a ``name`` attribute
    :type input: str
    :raises ValueError: when a non-string input has no ``name`` attribute
    """

    if input and not isinstance(input, str):
        try:
            self.input = input.name
        # Narrowed from a bare 'except:'; only a missing .name attribute
        # is the expected failure here
        except AttributeError as err:
            log.error(f"Input file '{input}' in bad format")
            raise ValueError(f"Input file '{input}' in bad format") from err
    else:
        self.input = input

    # Input format
    if input:
        input_name, input_extension = os.path.splitext(self.input)
        self.input_name = input_name
        self.input_extension = input_extension
        self.input_format = self.input_extension.replace(".", "")
    else:
        # Keep derived attributes defined (and consistent with set_output)
        # even when there is no input
        self.input_name = None
        self.input_extension = None
        self.input_format = None

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: The input file to set, given either as a path string or as a file-like object exposing a name attribute; the method derives and stores the input name, extension and format attributes from the file name.
def set_config(self, config: dict) -> None:
def set_config(self, config: dict) -> None:
    """
    Attach a configuration dictionary to this object.

    :param config: dictionary of configuration settings to use for this
    instance
    :type config: dict
    """

    self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
def set_param(self, param: dict) -> None:
    """
    Attach a parameter dictionary to this object.

    :param param: dictionary of parameters to use for this instance
    :type param: dict
    """

    self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
def init_variables(self) -> None:
    """
    Set up the default attributes used throughout the class.
    """

    # Simple defaults
    self.prefix = "howard"
    self.table_variants = "variants"
    self.dataframe = None
    self.index_additionnal_fields = []

    # Filter operator keywords -> SQL comparison operators
    self.comparison_map = {
        "gt": ">",
        "gte": ">=",
        "lt": "<",
        "lte": "<=",
        "equals": "=",
        "contains": "SIMILAR TO",
    }

    # VCF header type name -> internal type code
    self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

    # VCF header type name -> SQL column type
    self.code_type_map_to_sql = {
        "Integer": "INTEGER",
        "String": "VARCHAR",
        "Float": "FLOAT",
        "Flag": "VARCHAR",
    }

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
def get_indexing(self) -> bool:
    """
    Return the "indexing" parameter, defaulting to False when unset.

    :return: value of param["indexing"], or False if the key is absent
    """

    return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
def get_connexion_config(self) -> dict:
    """
    Build the connection configuration dictionary (threads, memory
    limit, temporary directory and access mode) from the object's
    configuration.

    :return: dictionary of connection options (DuckDB-style keys)
    """

    # config
    config = self.get_config()

    # Connexion config
    connexion_config = {}

    # Threads
    threads = self.get_threads()
    if threads:
        connexion_config["threads"] = threads

    # Memory limit
    if self.get_memory():
        connexion_config["memory_limit"] = self.get_memory()

    # Temporary directory
    if config.get("tmp", None):
        connexion_config["temp_directory"] = config.get("tmp")

    # Access mode (RO/RW shortcuts mapped to explicit modes)
    if config.get("access", None):
        access = config.get("access")
        if access in ["RO"]:
            access = "READ_ONLY"
        elif access in ["RW"]:
            access = "READ_WRITE"
        # An in-memory database can never be opened read-only.
        # Fixed: the original used 'connexion_db in ":memory:"', a
        # substring test that also matched unrelated values such as ""
        if self.get_connexion_db() == ":memory:":
            access = "READ_WRITE"
        connexion_config["access_mode"] = access

    return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration for the Connexion library.

def get_duckdb_settings(self) -> dict:
def get_duckdb_settings(self) -> dict:
    """
    Load DuckDB settings from the configuration.

    The "duckdb_settings" config entry may be either a path to a
    settings file (parsed with YAML, which also accepts JSON) or an
    inline JSON string; both are turned into a dictionary. An empty
    dictionary is returned when the entry is absent.

    :return: dictionary of DuckDB settings
    """

    duckdb_settings_dict = {}

    duckdb_settings = self.get_config().get("duckdb_settings", None)
    if duckdb_settings:
        duckdb_settings = full_path(duckdb_settings)
        if os.path.exists(duckdb_settings):
            # Settings given as a file
            with open(duckdb_settings) as settings_file:
                duckdb_settings_dict = yaml.safe_load(settings_file)
        else:
            # Settings given as an inline JSON string
            duckdb_settings_dict = json.loads(duckdb_settings)

    return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
def set_connexion_db(self) -> str:
    """
    Resolve and store the database the connection should point at.

    A duckdb input file is used directly; "memory" (or unset) maps to
    the in-memory database; "tmpfile" creates a temporary .db path;
    any other non-empty connexion type is taken as an explicit path.

    :return: the resolved connection database string
    """

    # Fallback: in-memory database
    default_connexion_db = ":memory:"

    if self.get_input_format() in ["db", "duckdb"]:
        # The input already is a duckdb database: connect to it directly
        connexion_db = self.get_input()
    elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
        connexion_db = default_connexion_db
    elif self.get_connexion_type() in ["tmpfile"]:
        # Temporary on-disk database inside a fresh temp directory
        tmp_dir_name = tempfile.mkdtemp(
            prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
        )
        connexion_db = f"{tmp_dir_name}/tmp.db"
    elif self.get_connexion_type() != "":
        # Any other non-empty value is taken as an explicit database path
        connexion_db = self.get_connexion_type()
    else:
        connexion_db = default_connexion_db

    # Store and return
    self.connexion_db = connexion_db
    return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
def set_connexion(self, conn) -> None:
    """
    Open (or adopt) the database connection for this object.

    When no connection is supplied, a new one is created according to
    the configured "connexion_format": duckdb (the default, with any
    optional PRAGMA settings applied) or sqlite.

    :param conn: an existing database connection, or None to create one
    """

    # Resolve target database and connection options
    connexion_db = self.set_connexion_db()
    connexion_config = self.get_connexion_config()

    # Connexion format (duckdb by default)
    connexion_format = self.get_config().get("connexion_format", "duckdb")
    self.connexion_format = connexion_format

    # Create a connection only when none was provided
    if not conn:
        if connexion_format in ["duckdb"]:
            conn = duckdb.connect(connexion_db, config=connexion_config)
            # Apply optional duckdb PRAGMA settings (strings are quoted)
            duckdb_settings = self.get_duckdb_settings()
            if duckdb_settings:
                for setting, setting_value in duckdb_settings.items():
                    if isinstance(setting_value, str):
                        setting_value = f"'{setting_value}'"
                    conn.execute(f"PRAGMA {setting}={setting_value};")
        elif connexion_format in ["sqlite"]:
            conn = sqlite3.connect(connexion_db)

    # Keep the connection
    self.conn = conn

    # Log
    log.debug(f"connexion_format: {connexion_format}")
    log.debug(f"connexion_db: {connexion_db}")
    log.debug(f"connexion config: {connexion_config}")
    log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite
def set_output(self, output: str = None) -> None:
def set_output(self, output: str = None) -> None:
    """
    Set the output file and derive its name, extension and format.

    Accepts either a path string or a file-like object carrying a
    ``name`` attribute. When no output is set, the derived attributes
    are reset to None.

    :param output: output file path, or an object with a ``name`` attribute
    :type output: str
    """

    # A string (or nothing) is stored as-is; anything else is assumed
    # file-like and contributes its .name
    if isinstance(output, str) or not output:
        self.output = output
    else:
        self.output = output.name

    # Derive name/extension/format from the file name, or reset
    if self.output:
        output_name, output_extension = os.path.splitext(self.output)
        self.output_name = output_name
        self.output_extension = output_extension
        self.output_format = output_extension.replace(".", "")
    else:
        self.output_name = None
        self.output_extension = None
        self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
def set_header(self) -> None:
    """
    Read the VCF header of the input file and store it both as a list
    of lines (`header_list`) and as a parsed VCF object (`header_vcf`).

    The header is looked up, in order: from the configured
    "header_file", from within the input file itself (vcf/hdr,
    compressed or not), from a companion "<input>.hdr" file, or
    reconstructed from the file's columns; a minimal default VCF
    header is used as a last resort. When there is no input file, both
    attributes are set to None.

    :raises ValueError: when the input file format is not supported
    """

    input_file = self.get_input()
    default_header_list = [
        "##fileformat=VCFv4.2",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
    ]

    # Full path
    input_file = full_path(input_file)

    if input_file:

        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        config = self.get_config()
        header_list = default_header_list
        if input_format in [
            "vcf",
            "hdr",
            "tsv",
            "csv",
            "psv",
            "parquet",
            "db",
            "duckdb",
        ]:
            # header provided in param
            if config.get("header_file", None):
                with open(config.get("header_file"), "rt") as f:
                    header_list = self.read_vcf_header(f)
            # within a vcf file format (header within input file itself)
            elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                # within a compressed vcf file format (.vcf.gz)
                if input_compressed:
                    with bgzf.open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within an uncompressed vcf file format (.vcf)
                else:
                    with open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
            # header provided in default external file .hdr
            elif os.path.exists((input_file + ".hdr")):
                with open(input_file + ".hdr", "rt") as f:
                    header_list = self.read_vcf_header(f)
            else:
                try:  # Try to get header info fields and file columns

                    with tempfile.TemporaryDirectory() as tmpdir:

                        # Create database
                        db_for_header = Database(database=input_file)

                        # Get header columns for infos fields
                        db_header_from_columns = (
                            db_for_header.get_header_from_columns()
                        )

                        # Get real columns in the file
                        db_header_columns = db_for_header.get_columns()

                        # Write header file (context manager closes it
                        # even when vcf.Writer raises)
                        header_file_tmp = os.path.join(tmpdir, "header")
                        with open(header_file_tmp, "w") as f:
                            vcf.Writer(f, db_header_from_columns)

                        # Replace #CHROM line with real columns
                        header_list = db_for_header.read_header_file(
                            header_file=header_file_tmp
                        )
                        header_list[-1] = "\t".join(db_header_columns)

                # Narrowed from a bare 'except:' so KeyboardInterrupt /
                # SystemExit are not swallowed; log the cause for debugging
                except Exception as e:

                    log.debug(f"Header construction failed: {e}")
                    log.warning(
                        f"No header for file {input_file}. Set as default VCF header"
                    )
                    header_list = default_header_list

        else:  # try for unknown format ?

            log.error(f"Input file format '{input_format}' not available")
            raise ValueError(f"Input file format '{input_format}' not available")

        if not header_list:
            header_list = default_header_list

        # header as list
        self.header_list = header_list

        # header as VCF object
        self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

    else:

        self.header_list = None
        self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
    """
    Execute a SQL query and return the result as a pandas DataFrame.

    :param query: SQL query to execute against the object's connection
    :type query: str
    :param limit: maximum number of rows to fetch; when None the full
    result is returned
    :type limit: int
    :return: result of the query as a pandas DataFrame
    :raises ValueError: when the connexion format is not supported
    """

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Explicit error instead of the UnboundLocalError on 'df' the
    # fall-through would otherwise produce for an unknown format
    if connexion_format not in ["duckdb", "sqlite"]:
        raise ValueError(f"Connexion format '{connexion_format}' not available")

    # Limit in query
    if limit:
        pd.set_option("display.max_rows", limit)
        if connexion_format in ["duckdb"]:
            df = (
                self.conn.execute(query)
                .fetch_record_batch(limit)
                .read_next_batch()
                .to_pandas()
            )
        elif connexion_format in ["sqlite"]:
            # chunksize yields DataFrames of at most 'limit' rows
            df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

    # Full query
    else:
        if connexion_format in ["duckdb"]:
            df = self.conn.execute(query).df()
        elif connexion_format in ["sqlite"]:
            df = pd.read_sql_query(query, self.conn)

    return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full query result is returned.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
def get_overview(self) -> None:
    """
    Log an overview of the object: input/output files, config, params,
    sample list and a dataframe preview of the variants table.
    """

    # Preview dataframe of the variants table
    table_variants_from = self.get_table_variants(clause="from")
    sql_columns = self.get_header_columns_as_sql()
    df = self.get_query_to_df(f"SELECT {sql_columns} FROM {table_variants_from}")

    # Input / Output
    log.info(f"Input:  {self.get_input()} [{self.get_input_format()}]")
    log.info(f"Output: {self.get_output()} [{self.get_output_format()}]")

    # Config and Param, one log line per pretty-printed JSON line
    log.info("Config: ")
    for line in json.dumps(self.get_config(), indent=4, sort_keys=True).split("\n"):
        log.info("\t" + str(line))
    log.info("Param: ")
    for line in json.dumps(self.get_param(), indent=4, sort_keys=True).split("\n"):
        log.info("\t" + str(line))

    # Samples and dataframe preview
    log.info("Sample list: " + str(self.get_header_sample_list()))
    log.info("Dataframe: ")
    for line in str(df).split("\n"):
        log.info("\t" + str(line))

    # garbage collector
    del df
    gc.collect()

    return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
def get_stats(self) -> dict:
    """
    Compute statistics of the current object: input file, variants per
    chromosome, per-sample genotype counts, INFO/FORMAT header fields,
    QUAL summary statistics, and SNV/InDel/MNV counts and substitutions.

    NOTE(review): the SQL below uses DuckDB-specific functions
    (REGEXP_EXTRACT, len, string_split) via self.conn — presumably
    only valid for the duckdb connexion format; confirm before reuse.

    :return: a dictionary with "Infos", "Variants", optional "Samples",
    "Header" and optional "Quality" sections
    """

    # Log
    log.info(f"Stats Calculation...")

    # table variants
    table_variants_from = self.get_table_variants()

    # stats dict
    stats = {"Infos": {}}

    ### File
    input_file = self.get_input()
    stats["Infos"]["Input file"] = input_file

    # Header
    header_infos = self.get_header().infos
    header_formats = self.get_header().formats
    header_infos_list = list(header_infos)
    header_formats_list = list(header_formats)

    ### Variants

    stats["Variants"] = {}

    # Variants by chr
    sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
    df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
    nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
        by=["CHROM"], kind="quicksort"
    )

    # Total number of variants
    nb_of_variants = nb_of_variants_by_chrom["count"].sum()

    # Calculate percentage (fraction of total, per chromosome)
    nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
        lambda x: (x / nb_of_variants)
    )

    stats["Variants"]["Number of variants by chromosome"] = (
        nb_of_variants_by_chrom.to_dict(orient="index")
    )

    stats["Infos"]["Number of variants"] = int(nb_of_variants)

    ### Samples

    # Init
    samples = {}
    nb_of_samples = 0

    # Check Samples: requires a GT FORMAT field and a FORMAT column
    if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
        log.debug(f"Check samples...")
        for sample in self.get_header_sample_list():
            # Count genotypes per sample; only rows where the sample column
            # matches a genotype pattern and has as many ':'-separated
            # fields as the FORMAT column are kept
            sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
            sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
            sample_genotype_count = sql_query_genotype_df["count"].sum()
            if len(sql_query_genotype_df):
                nb_of_samples += 1
                samples[f"{sample} - {sample_genotype_count} variants"] = (
                    sql_query_genotype_df.to_dict(orient="index")
                )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

    ### INFO and FORMAT fields
    header_types_df = {}
    header_types_list = {
        "List of INFO fields": header_infos,
        "List of FORMAT fields": header_formats,
    }
    # 'i' numbers the fields continuously across both sections
    i = 0
    for header_type in header_types_list:

        header_type_infos = header_types_list.get(header_type)
        header_infos_dict = {}

        for info in header_type_infos:

            i += 1
            header_infos_dict[i] = {}

            # ID
            header_infos_dict[i]["id"] = info

            # num: special VCF "Number" codes (None='.', -1='A', -2='G', -3='R')
            genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
            if header_type_infos[info].num in genotype_map.keys():
                header_infos_dict[i]["Number"] = genotype_map.get(
                    header_type_infos[info].num
                )
            else:
                header_infos_dict[i]["Number"] = header_type_infos[info].num

            # type ('.' when missing)
            if header_type_infos[info].type:
                header_infos_dict[i]["Type"] = header_type_infos[info].type
            else:
                header_infos_dict[i]["Type"] = "."

            # desc (empty string when missing)
            if header_type_infos[info].desc != None:
                header_infos_dict[i]["Description"] = header_type_infos[info].desc
            else:
                header_infos_dict[i]["Description"] = ""

        if len(header_infos_dict):
            header_types_df[header_type] = pd.DataFrame.from_dict(
                header_infos_dict, orient="index"
            ).to_dict(orient="index")

    # Stats
    stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
    stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
    stats["Header"] = header_types_df

    ### QUAL summary statistics (skipping missing '.' values)
    if "QUAL" in self.get_header_columns():
        sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                """

        qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
        stats["Quality"] = {"Stats": qual}

    ### SNV and InDel counts (plus MNV and total), ordered by count

    sql_query_snv = f"""
        
        SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)
                
                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

        ORDER BY count DESC

            """
    snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

    # Single-nucleotide substitution spectrum (e.g. 'A>G'), most common first
    sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
            """
    snv_substitution = (
        self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
    )
    stats["Variants"]["Counts"] = snv_indel
    stats["Variants"]["Substitutions"] = snv_substitution

    return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
792    def stats_to_file(self, file: str = None) -> str:
793        """
794        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
795        into a JSON object, and writes the JSON object to the specified file.
796
797        :param file: The `file` parameter is a string that represents the file path where the JSON data
798        will be written
799        :type file: str
800        :return: the name of the file that was written to.
801        """
802
803        # Get stats
804        stats = self.get_stats()
805
806        # Serializing json
807        json_object = json.dumps(stats, indent=4)
808
809        # Writing to sample.json
810        with open(file, "w") as outfile:
811            outfile.write(json_object)
812
813        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
815    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
816        """
817        The `print_stats` function generates a markdown file and prints the statistics contained in a
818        JSON file in a formatted manner.
819
820        :param output_file: The `output_file` parameter is a string that specifies the path and filename
821        of the output file where the stats will be printed in Markdown format. If no `output_file` is
822        provided, a temporary directory will be created and the stats will be saved in a file named
823        "stats.md" within that
824        :type output_file: str
825        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
826        file where the statistics will be saved. If no value is provided, a temporary directory will be
827        created and a default file name "stats.json" will be used
828        :type json_file: str
829        :return: The function `print_stats` does not return any value. It has a return type annotation
830        of `None`.
831        """
832
833        # Full path
834        output_file = full_path(output_file)
835        json_file = full_path(json_file)
836
837        with tempfile.TemporaryDirectory() as tmpdir:
838
839            # Files
840            if not output_file:
841                output_file = os.path.join(tmpdir, "stats.md")
842            if not json_file:
843                json_file = os.path.join(tmpdir, "stats.json")
844
845            # Create folders
846            if not os.path.exists(os.path.dirname(output_file)):
847                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
848            if not os.path.exists(os.path.dirname(json_file)):
849                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
850
851            # Create stats JSON file
852            stats_file = self.stats_to_file(file=json_file)
853
854            # Print stats file
855            with open(stats_file) as f:
856                stats = yaml.safe_load(f)
857
858            # Output
859            output_title = []
860            output_index = []
861            output = []
862
863            # Title
864            output_title.append("# HOWARD Stats")
865
866            # Index
867            output_index.append("## Index")
868
869            # Process sections
870            for section in stats:
871                infos = stats.get(section)
872                section_link = "#" + section.lower().replace(" ", "-")
873                output.append(f"## {section}")
874                output_index.append(f"- [{section}]({section_link})")
875
876                if len(infos):
877                    for info in infos:
878                        try:
879                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
880                            is_df = True
881                        except:
882                            try:
883                                df = pd.DataFrame.from_dict(
884                                    json.loads((infos.get(info))), orient="index"
885                                )
886                                is_df = True
887                            except:
888                                is_df = False
889                        if is_df:
890                            output.append(f"### {info}")
891                            info_link = "#" + info.lower().replace(" ", "-")
892                            output_index.append(f"   - [{info}]({info_link})")
893                            output.append(f"{df.to_markdown(index=False)}")
894                        else:
895                            output.append(f"- {info}: {infos.get(info)}")
896                else:
897                    output.append(f"NA")
898
899            # Write stats in markdown file
900            with open(output_file, "w") as fp:
901                for item in output_title:
902                    fp.write("%s\n" % item)
903                for item in output_index:
904                    fp.write("%s\n" % item)
905                for item in output:
906                    fp.write("%s\n" % item)
907
908            # Output stats in markdown
909            print("")
910            print("\n\n".join(output_title))
911            print("")
912            print("\n\n".join(output))
913            print("")
914
915        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
917    def get_input(self) -> str:
918        """
919        It returns the value of the input variable.
920        :return: The input is being returned.
921        """
922        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
924    def get_input_format(self, input_file: str = None) -> str:
925        """
926        This function returns the format of the input variable, either from the provided input file or
927        by prompting for input.
928
929        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
930        represents the file path of the input file. If no `input_file` is provided when calling the
931        method, it will default to `None`
932        :type input_file: str
933        :return: The format of the input variable is being returned.
934        """
935
936        if not input_file:
937            input_file = self.get_input()
938        input_format = get_file_format(input_file)
939        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
941    def get_input_compressed(self, input_file: str = None) -> str:
942        """
943        The function `get_input_compressed` returns the format of the input variable after compressing
944        it.
945
946        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
947        that represents the file path of the input file. If no `input_file` is provided when calling the
948        method, it will default to `None` and the method will then call `self.get_input()` to
949        :type input_file: str
950        :return: The function `get_input_compressed` returns the compressed format of the input
951        variable.
952        """
953
954        if not input_file:
955            input_file = self.get_input()
956        input_compressed = get_file_compressed(input_file)
957        return input_compressed

The function get_input_compressed returns whether the input file is compressed.

Parameters
  • input_file: The input_file parameter is a string that represents the file path of the input file. If no input_file is provided when calling the method, it defaults to None and the method falls back to self.get_input()
Returns

The compression status of the input file.

def get_output(self) -> str:
959    def get_output(self) -> str:
960        """
961        It returns the output of the neuron.
962        :return: The output of the neural network.
963        """
964
965        return self.output

It returns the value of the output variable.

Returns

The output file path is being returned.

def get_output_format(self, output_file: str = None) -> str:
967    def get_output_format(self, output_file: str = None) -> str:
968        """
969        The function `get_output_format` returns the format of the input variable or the output file if
970        provided.
971
972        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
973        that represents the file path of the output file. If no `output_file` is provided when calling
974        the method, it will default to the output obtained from the `get_output` method of the class
975        instance. The
976        :type output_file: str
977        :return: The format of the input variable is being returned.
978        """
979
980        if not output_file:
981            output_file = self.get_output()
982        output_format = get_file_format(output_file)
983
984        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
986    def get_config(self) -> dict:
987        """
988        It returns the config
989        :return: The config variable is being returned.
990        """
991        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
993    def get_param(self) -> dict:
994        """
995        It returns the param
996        :return: The param variable is being returned.
997        """
998        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
1000    def get_connexion_db(self) -> str:
1001        """
1002        It returns the connexion_db attribute of the object
1003        :return: The connexion_db is being returned.
1004        """
1005        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
1007    def get_prefix(self) -> str:
1008        """
1009        It returns the prefix of the object.
1010        :return: The prefix is being returned.
1011        """
1012        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
1014    def get_table_variants(self, clause: str = "select") -> str:
1015        """
1016        This function returns the table_variants attribute of the object
1017
1018        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
1019        defaults to select (optional)
1020        :return: The table_variants attribute of the object.
1021        """
1022
1023        # Access
1024        access = self.get_config().get("access", None)
1025
1026        # Clauses "select", "where", "update"
1027        if clause in ["select", "where", "update"]:
1028            table_variants = self.table_variants
1029        # Clause "from"
1030        elif clause in ["from"]:
1031            # For Read Only
1032            if self.get_input_format() in ["parquet"] and access in ["RO"]:
1033                input_file = self.get_input()
1034                table_variants = f"'{input_file}' as variants"
1035            # For Read Write
1036            else:
1037                table_variants = f"{self.table_variants} as variants"
1038        else:
1039            table_variants = self.table_variants
1040        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
1042    def get_tmp_dir(self) -> str:
1043        """
1044        The function `get_tmp_dir` returns the temporary directory path based on configuration
1045        parameters or a default path.
1046        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1047        configuration, parameters, and a default value of "/tmp".
1048        """
1049
1050        return get_tmp(
1051            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1052        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
1054    def get_connexion_type(self) -> str:
1055        """
1056        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1057
1058        :return: The connexion type is being returned.
1059        """
1060        return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the configuration, defaulting to "memory" when not configured.

Returns

The connexion type is being returned.

def get_connexion(self):
1062    def get_connexion(self):
1063        """
1064        It returns the connection object
1065
1066        :return: The connection object.
1067        """
1068        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1070    def close_connexion(self) -> None:
1071        """
1072        This function closes the connection to the database.
1073        :return: The connection is being closed.
1074        """
1075        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1077    def get_header(self, type: str = "vcf"):
1078        """
1079        This function returns the header of the VCF file as a list of strings
1080
1081        :param type: the type of header you want to get, defaults to vcf (optional)
1082        :return: The header of the vcf file.
1083        """
1084
1085        if self.header_vcf:
1086            if type == "vcf":
1087                return self.header_vcf
1088            elif type == "list":
1089                return self.header_list
1090        else:
1091            if type == "vcf":
1092                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1093                return header
1094            elif type == "list":
1095                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_infos_list(self) -> list:
1097    def get_header_infos_list(self) -> list:
1098        """
1099        This function retrieves a list of information fields from the header.
1100        :return: A list of information fields from the header.
1101        """
1102
1103        # Init
1104        infos_list = []
1105
1106        for field in self.get_header().infos:
1107            infos_list.append(field)
1108
1109        return infos_list

This function retrieves a list of information fields from the header.

Returns

A list of information fields from the header.

def get_header_length(self, file: str = None) -> int:
1111    def get_header_length(self, file: str = None) -> int:
1112        """
1113        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1114        line.
1115
1116        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1117        header file. If this argument is provided, the function will read the header from the specified
1118        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1119        :type file: str
1120        :return: the length of the header list, excluding the #CHROM line.
1121        """
1122
1123        if file:
1124            return len(self.read_vcf_header_file(file=file)) - 1
1125        elif self.get_header(type="list"):
1126            return len(self.get_header(type="list")) - 1
1127        else:
1128            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1130    def get_header_columns(self) -> str:
1131        """
1132        This function returns the header list of a VCF
1133
1134        :return: The length of the header list.
1135        """
1136        if self.get_header():
1137            return self.get_header(type="list")[-1]
1138        else:
1139            return ""

This function returns the last header line (the #CHROM columns line) of a VCF.

Returns

The #CHROM header line, or an empty string when there is no header.

def get_header_columns_as_list(self) -> list:
1141    def get_header_columns_as_list(self) -> list:
1142        """
1143        This function returns the header list of a VCF
1144
1145        :return: The length of the header list.
1146        """
1147        if self.get_header():
1148            return self.get_header_columns().strip().split("\t")
1149        else:
1150            return []

This function returns the #CHROM header line of a VCF split into a list of column names.

Returns

The list of header column names, or an empty list when there is no header.

def get_header_columns_as_sql(self) -> str:
1152    def get_header_columns_as_sql(self) -> str:
1153        """
1154        This function retruns header length (without #CHROM line)
1155
1156        :return: The length of the header list.
1157        """
1158        sql_column_list = []
1159        for col in self.get_header_columns_as_list():
1160            sql_column_list.append(f'"{col}"')
1161        return ",".join(sql_column_list)

This function returns the header column names quoted and joined for use in SQL.

Returns

A comma-separated string of double-quoted column names.

def get_header_sample_list( self, check: bool = False, samples: list = None, samples_force: bool = False) -> list:
1163    def get_header_sample_list(
1164        self, check: bool = False, samples: list = None, samples_force: bool = False
1165    ) -> list:
1166        """
1167        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
1168        checking and filtering based on input parameters.
1169
1170        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
1171        parameter that determines whether to check if the samples in the list are properly defined as
1172        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
1173        list is defined as a, defaults to False
1174        :type check: bool (optional)
1175        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
1176        allows you to specify a subset of samples from the header. If you provide a list of sample
1177        names, the function will check if each sample is defined in the header. If a sample is not found
1178        in the
1179        :type samples: list
1180        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
1181        a boolean parameter that determines whether to force the function to return the sample list
1182        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
1183        function will return the sample list without performing, defaults to False
1184        :type samples_force: bool (optional)
1185        :return: The function `get_header_sample_list` returns a list of samples based on the input
1186        parameters and conditions specified in the function.
1187        """
1188
1189        # Init
1190        samples_list = []
1191
1192        if samples is None:
1193            samples_list = self.header_vcf.samples
1194        else:
1195            samples_checked = []
1196            for sample in samples:
1197                if sample in self.header_vcf.samples:
1198                    samples_checked.append(sample)
1199                else:
1200                    log.warning(f"Sample '{sample}' not defined in header")
1201            samples_list = samples_checked
1202
1203            # Force sample list without checking if is_genotype_column
1204            if samples_force:
1205                log.warning(f"Samples {samples_list} not checked if genotypes")
1206                return samples_list
1207
1208        if check:
1209            samples_checked = []
1210            for sample in samples_list:
1211                if self.is_genotype_column(column=sample):
1212                    samples_checked.append(sample)
1213                else:
1214                    log.warning(
1215                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
1216                    )
1217            samples_list = samples_checked
1218
1219        # Return samples list
1220        return samples_list

The function get_header_sample_list returns a list of samples from a VCF header, with optional checking and filtering based on input parameters.

Parameters
  • check: The check parameter in the get_header_sample_list function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If check is set to True, the function will verify if each sample in the list is defined as a, defaults to False
  • samples: The samples parameter in the get_header_sample_list function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the
  • samples_force: The samples_force parameter in the get_header_sample_list function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If samples_force is set to True, the function will return the sample list without performing, defaults to False
Returns

The function get_header_sample_list returns a list of samples based on the input parameters and conditions specified in the function.

def is_genotype_column(self, column: str = None) -> bool:
1222    def is_genotype_column(self, column: str = None) -> bool:
1223        """
1224        This function checks if a given column is a genotype column in a database.
1225
1226        :param column: The `column` parameter in the `is_genotype_column` method is a string that
1227        represents the column name in a database table. This method checks if the specified column is a
1228        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
1229        method of
1230        :type column: str
1231        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
1232        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
1233        column name and returns the result. If the `column` parameter is None, it returns False.
1234        """
1235
1236        if column is not None:
1237            return Database(database=self.get_input()).is_genotype_column(column=column)
1238        else:
1239            return False

This function checks if a given column is a genotype column in a database.

Parameters
  • column: The column parameter in the is_genotype_column method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls the is_genotype_column method of
Returns

The is_genotype_column method is returning a boolean value. If the column parameter is not None, it calls the is_genotype_column method of the Database class with the specified column name and returns the result. If the column parameter is None, it returns False.

def get_verbose(self) -> bool:
1241    def get_verbose(self) -> bool:
1242        """
1243        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1244        exist
1245
1246        :return: The value of the key "verbose" in the config dictionary.
1247        """
1248        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1250    def get_connexion_format(self) -> str:
1251        """
1252        It returns the connexion format of the object.
1253        :return: The connexion_format is being returned.
1254        """
1255        connexion_format = self.connexion_format
1256        if connexion_format not in ["duckdb", "sqlite"]:
1257            log.error(f"Unknown connexion format {connexion_format}")
1258            raise ValueError(f"Unknown connexion format {connexion_format}")
1259        else:
1260            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        'variants' table of the current connexion.

        :param file: path of the file to load into the table
        :param columns: comma-separated list of column names to insert
        :type columns: str
        :param header_len: number of leading lines to skip (e.g. a VCF
        header) before reading data, defaults to 0
        :type header_len: int (optional)
        :param sep: field separator used in the file, defaults to a tab
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; overridden by the
        "load.chunk" configuration entry when present, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the configured chunk size takes precedence over the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if chunksize resolves to a falsy value (0/None),
        # nothing is inserted — confirm this is intended
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves "chunk" in the SQL via its Python
                    # replacement scan of local variables, so the loop
                    # variable must keep the name "chunk"
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # For sqlite, delegate the append to pandas
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to , which represents a tab character. You can change this parameter to a different separator character if, defaults to
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
1316    def load_data(
1317        self,
1318        input_file: str = None,
1319        drop_variants_table: bool = False,
1320        sample_size: int = 20480,
1321    ) -> None:
1322        """
1323        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1324        table before loading the data and specify a sample size.
1325
1326        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1327        table
1328        :type input_file: str
1329        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1330        determines whether the variants table should be dropped before loading the data. If set to
1331        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1332        not be dropped, defaults to False
1333        :type drop_variants_table: bool (optional)
1334        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1335        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1336        20480
1337        :type sample_size: int (optional)
1338        """
1339
1340        log.info("Loading...")
1341
1342        # change input file
1343        if input_file:
1344            self.set_input(input_file)
1345            self.set_header()
1346
1347        # drop variants table
1348        if drop_variants_table:
1349            self.drop_variants_table()
1350
1351        # get table variants
1352        table_variants = self.get_table_variants()
1353
1354        # Access
1355        access = self.get_config().get("access", None)
1356        log.debug(f"access: {access}")
1357
1358        # Input format and compress
1359        input_format = self.get_input_format()
1360        input_compressed = self.get_input_compressed()
1361        log.debug(f"input_format: {input_format}")
1362        log.debug(f"input_compressed: {input_compressed}")
1363
1364        # input_compressed_format
1365        if input_compressed:
1366            input_compressed_format = "gzip"
1367        else:
1368            input_compressed_format = "none"
1369        log.debug(f"input_compressed_format: {input_compressed_format}")
1370
1371        # Connexion format
1372        connexion_format = self.get_connexion_format()
1373
1374        # Sample size
1375        if not sample_size:
1376            sample_size = -1
1377        log.debug(f"sample_size: {sample_size}")
1378
1379        # Load data
1380        log.debug(f"Load Data from {input_format}")
1381
1382        # DuckDB connexion
1383        if connexion_format in ["duckdb"]:
1384
1385            # Database already exists
1386            if self.input_format in ["db", "duckdb"]:
1387
1388                if connexion_format in ["duckdb"]:
1389                    log.debug(f"Input file format '{self.input_format}' duckDB")
1390                else:
1391                    log.error(
1392                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1393                    )
1394                    raise ValueError(
1395                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1396                    )
1397
1398            # Load from existing database format
1399            else:
1400
1401                try:
1402                    # Create Table or View
1403                    database = Database(database=self.input)
1404                    sql_from = database.get_sql_from(sample_size=sample_size)
1405
1406                    if access in ["RO"]:
1407                        sql_load = (
1408                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1409                        )
1410                    else:
1411                        sql_load = (
1412                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1413                        )
1414                    self.conn.execute(sql_load)
1415
1416                except:
1417                    # Format not available
1418                    log.error(f"Input file format '{self.input_format}' not available")
1419                    raise ValueError(
1420                        f"Input file format '{self.input_format}' not available"
1421                    )
1422
1423        # SQLite connexion
1424        elif connexion_format in ["sqlite"] and input_format in [
1425            "vcf",
1426            "tsv",
1427            "csv",
1428            "psv",
1429        ]:
1430
1431            # Main structure
1432            structure = {
1433                "#CHROM": "VARCHAR",
1434                "POS": "INTEGER",
1435                "ID": "VARCHAR",
1436                "REF": "VARCHAR",
1437                "ALT": "VARCHAR",
1438                "QUAL": "VARCHAR",
1439                "FILTER": "VARCHAR",
1440                "INFO": "VARCHAR",
1441            }
1442
1443            # Strcuture with samples
1444            structure_complete = structure
1445            if self.get_header_sample_list():
1446                structure["FORMAT"] = "VARCHAR"
1447                for sample in self.get_header_sample_list():
1448                    structure_complete[sample] = "VARCHAR"
1449
1450            # Columns list for create and insert
1451            sql_create_table_columns = []
1452            sql_create_table_columns_list = []
1453            for column in structure_complete:
1454                column_type = structure_complete[column]
1455                sql_create_table_columns.append(
1456                    f'"{column}" {column_type} default NULL'
1457                )
1458                sql_create_table_columns_list.append(f'"{column}"')
1459
1460            # Create database
1461            log.debug(f"Create Table {table_variants}")
1462            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1463            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1464            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1465            self.conn.execute(sql_create_table)
1466
1467            # chunksize define length of file chunk load file
1468            chunksize = 100000
1469
1470            # delimiter
1471            delimiter = file_format_delimiters.get(input_format, "\t")
1472
1473            # Load the input file
1474            with open(self.input, "rt") as input_file:
1475
1476                # Use the appropriate file handler based on the input format
1477                if input_compressed:
1478                    input_file = bgzf.open(self.input, "rt")
1479                if input_format in ["vcf"]:
1480                    header_len = self.get_header_length()
1481                else:
1482                    header_len = 0
1483
1484                # Insert the file contents into a table
1485                self.insert_file_to_table(
1486                    input_file,
1487                    columns=sql_create_table_columns_list_sql,
1488                    header_len=header_len,
1489                    sep=delimiter,
1490                    chunksize=chunksize,
1491                )
1492
1493        else:
1494            log.error(
1495                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1496            )
1497            raise ValueError(
1498                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1499            )
1500
1501        # Explode INFOS fields into table fields
1502        if self.get_explode_infos():
1503            self.explode_infos(
1504                prefix=self.get_explode_infos_prefix(),
1505                fields=self.get_explode_infos_fields(),
1506                force=True,
1507            )
1508
1509        # Create index after insertion
1510        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1512    def get_explode_infos(self) -> bool:
1513        """
1514        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1515        to False if it is not set.
1516        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1517        value. If the parameter is not present, it will return False.
1518        """
1519
1520        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter (nested under the "explode" section of the parameters), defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1522    def get_explode_infos_fields(
1523        self,
1524        explode_infos_fields: str = None,
1525        remove_fields_not_in_header: bool = False,
1526    ) -> list:
1527        """
1528        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1529        the input parameter `explode_infos_fields`.
1530
1531        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1532        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1533        comma-separated list of field names to explode
1534        :type explode_infos_fields: str
1535        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1536        flag that determines whether to remove fields that are not present in the header. If it is set
1537        to `True`, any field that is not in the header will be excluded from the list of exploded
1538        information fields. If it is set to `, defaults to False
1539        :type remove_fields_not_in_header: bool (optional)
1540        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1541        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1542        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1543        Otherwise, it returns a list of exploded information fields after removing any spaces and
1544        splitting the string by commas.
1545        """
1546
1547        # If no fields, get it in param
1548        if not explode_infos_fields:
1549            explode_infos_fields = (
1550                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1551            )
1552
1553        # If no fields, defined as all fields in header using keyword
1554        if not explode_infos_fields:
1555            explode_infos_fields = "*"
1556
1557        # If fields list not empty
1558        if explode_infos_fields:
1559
1560            # Input fields list
1561            if isinstance(explode_infos_fields, str):
1562                fields_input = explode_infos_fields.split(",")
1563            elif isinstance(explode_infos_fields, list):
1564                fields_input = explode_infos_fields
1565            else:
1566                fields_input = []
1567
1568            # Fields list without * keyword
1569            fields_without_all = fields_input.copy()
1570            if "*".casefold() in (item.casefold() for item in fields_without_all):
1571                fields_without_all.remove("*")
1572
1573            # Fields in header
1574            fields_in_header = sorted(list(set(self.get_header().infos)))
1575
1576            # Construct list of fields
1577            fields_output = []
1578            for field in fields_input:
1579
1580                # Strip field
1581                field = field.strip()
1582
1583                # format keyword * in regex
1584                if field.upper() in ["*"]:
1585                    field = ".*"
1586
1587                # Find all fields with pattern
1588                r = re.compile(field)
1589                fields_search = sorted(list(filter(r.match, fields_in_header)))
1590
1591                # Remove fields input from search
1592                if field in fields_search:
1593                    fields_search = [field]
1594                elif fields_search != [field]:
1595                    fields_search = sorted(
1596                        list(set(fields_search).difference(fields_input))
1597                    )
1598
1599                # If field is not in header (avoid not well formatted header)
1600                if not fields_search and not remove_fields_not_in_header:
1601                    fields_search = [field]
1602
1603                # Add found fields
1604                for new_field in fields_search:
1605                    # Add field, if not already exists, and if it is in header (if asked)
1606                    if (
1607                        new_field not in fields_output
1608                        and (
1609                            not remove_fields_not_in_header
1610                            or new_field in fields_in_header
1611                        )
1612                        and new_field not in [".*"]
1613                    ):
1614                        fields_output.append(new_field)
1615
1616            return fields_output
1617
1618        else:
1619
1620            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter specifies the fields to be exploded, either as a comma-separated string or a list of field names. Each entry may be a regex pattern matched against the header INFO fields, and the keyword "*" selects all fields of the header
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields; if it is set to False (default), such fields are kept as-is, defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided, it is read from the parameters and falls back to "*", which selects all INFO fields of the header. Comma-separated strings are split and each field is stripped of surrounding spaces before being resolved against the header.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1622    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1623        """
1624        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1625        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1626        not provided.
1627
1628        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1629        prefix to be used for exploding or expanding information
1630        :type explode_infos_prefix: str
1631        :return: the value of the variable `explode_infos_prefix`.
1632        """
1633
1634        if not explode_infos_prefix:
1635            explode_infos_prefix = (
1636                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1637            )
1638
1639        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or, when it is not provided, the value of the "explode_infos_prefix" entry in the "explode" section of the parameters, defaulting to an empty string.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
1641    def add_column(
1642        self,
1643        table_name,
1644        column_name,
1645        column_type,
1646        default_value=None,
1647        drop: bool = False,
1648    ) -> dict:
1649        """
1650        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1651        doesn't already exist.
1652
1653        :param table_name: The name of the table to which you want to add a column
1654        :param column_name: The parameter "column_name" is the name of the column that you want to add
1655        to the table
1656        :param column_type: The `column_type` parameter specifies the data type of the column that you
1657        want to add to the table. It should be a string that represents the desired data type, such as
1658        "INTEGER", "TEXT", "REAL", etc
1659        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1660        default value for the newly added column. If a default value is provided, it will be assigned to
1661        the column for any existing rows that do not have a value for that column
1662        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1663        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1664        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1665        to False
1666        :type drop: bool (optional)
1667        :return: a boolean value indicating whether the column was successfully added to the table.
1668        """
1669
1670        # added
1671        added = False
1672        dropped = False
1673
1674        # Check if the column already exists in the table
1675        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1676        columns = self.get_query_to_df(query).columns.tolist()
1677        if column_name.upper() in [c.upper() for c in columns]:
1678            log.debug(
1679                f"The {column_name} column already exists in the {table_name} table"
1680            )
1681            if drop:
1682                self.drop_column(table_name=table_name, column_name=column_name)
1683                dropped = True
1684            else:
1685                return None
1686        else:
1687            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1688
1689        # Add column in table
1690        add_column_query = (
1691            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1692        )
1693        if default_value is not None:
1694            add_column_query += f" DEFAULT {default_value}"
1695        self.execute_query(add_column_query)
1696        added = not dropped
1697        log.debug(
1698            f"The {column_name} column was successfully added to the {table_name} table"
1699        )
1700
1701        if added:
1702            added_column = {
1703                "table_name": table_name,
1704                "column_name": column_name,
1705                "column_type": column_type,
1706                "default_value": default_value,
1707            }
1708        else:
1709            added_column = None
1710
1711        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column. If drop is set to False (default),, defaults to False
Returns

a dict describing the added column (table name, column name, column type and default value), or None when the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1713    def drop_column(
1714        self, column: dict = None, table_name: str = None, column_name: str = None
1715    ) -> bool:
1716        """
1717        The `drop_column` function drops a specified column from a given table in a database and returns
1718        True if the column was successfully dropped, and False if the column does not exist in the
1719        table.
1720
1721        :param column: The `column` parameter is a dictionary that contains information about the column
1722        you want to drop. It has two keys:
1723        :type column: dict
1724        :param table_name: The `table_name` parameter is the name of the table from which you want to
1725        drop a column
1726        :type table_name: str
1727        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1728        from the table
1729        :type column_name: str
1730        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1731        and False if the column does not exist in the table.
1732        """
1733
1734        # Find column infos
1735        if column:
1736            if isinstance(column, dict):
1737                table_name = column.get("table_name", None)
1738                column_name = column.get("column_name", None)
1739            elif isinstance(column, str):
1740                table_name = self.get_table_variants()
1741                column_name = column
1742            else:
1743                table_name = None
1744                column_name = None
1745
1746        if not table_name and not column_name:
1747            return False
1748
1749        # Removed
1750        removed = False
1751
1752        # Check if the column already exists in the table
1753        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1754        columns = self.get_query_to_df(query).columns.tolist()
1755        if column_name in columns:
1756            log.debug(f"The {column_name} column exists in the {table_name} table")
1757        else:
1758            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1759            return False
1760
1761        # Add column in table # ALTER TABLE integers DROP k
1762        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1763        self.execute_query(add_column_query)
1764        removed = True
1765        log.debug(
1766            f"The {column_name} column was successfully dropped to the {table_name} table"
1767        )
1768
1769        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is either a dictionary with the two keys "table_name" and "column_name", or a plain column name (string), in which case the column is dropped from the variants table
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
1771    def explode_infos(
1772        self,
1773        prefix: str = None,
1774        create_index: bool = False,
1775        fields: list = None,
1776        force: bool = False,
1777        proccess_all_fields_together: bool = False,
1778        table: str = None,
1779    ) -> list:
1780        """
1781        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
1782        individual columns, returning a list of added columns.
1783
1784        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
1785        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
1786        `self.get_explode_infos_prefix()` as the prefix
1787        :type prefix: str
1788        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
1789        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
1790        `False`, indexes will not be created. The default value is `False`, defaults to False
1791        :type create_index: bool (optional)
1792        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
1793        that you want to explode into individual columns. If this parameter is not provided, all INFO
1794        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
1795        a list to the `
1796        :type fields: list
1797        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
1798        determines whether to drop and recreate a column if it already exists in the table. If `force`
1799        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
1800        defaults to False
1801        :type force: bool (optional)
1802        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
1803        flag that determines whether to process all the INFO fields together or individually. If set to
1804        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
1805        be processed individually. The default value is, defaults to False
1806        :type proccess_all_fields_together: bool (optional)
1807        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
1808        of the table where the exploded INFO fields will be added as individual columns. If you provide
1809        a value for the `table` parameter, the function will use that table name. If the `table`
1810        parameter is
1811        :type table: str
1812        :return: The `explode_infos` function returns a list of added columns.
1813        """
1814
1815        # drop indexes
1816        self.drop_indexes()
1817
1818        # connexion format
1819        connexion_format = self.get_connexion_format()
1820
1821        # Access
1822        access = self.get_config().get("access", None)
1823
1824        # Added columns
1825        added_columns = []
1826
1827        if access not in ["RO"]:
1828
1829            # prefix
1830            if prefix in [None, True] or not isinstance(prefix, str):
1831                if self.get_explode_infos_prefix() not in [None, True]:
1832                    prefix = self.get_explode_infos_prefix()
1833                else:
1834                    prefix = "INFO/"
1835
1836            # table variants
1837            if table is not None:
1838                table_variants = table
1839            else:
1840                table_variants = self.get_table_variants(clause="select")
1841
1842            # extra infos
1843            try:
1844                extra_infos = self.get_extra_infos()
1845            except:
1846                extra_infos = []
1847
1848            # Header infos
1849            header_infos = self.get_header().infos
1850
1851            log.debug(
1852                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
1853            )
1854
1855            sql_info_alter_table_array = []
1856
1857            # Info fields to check
1858            fields_list = list(header_infos)
1859            if fields:
1860                fields_list += fields
1861            fields_list = set(fields_list)
1862
1863            # If no fields
1864            if not fields:
1865                fields = []
1866
1867            # Translate fields if patterns
1868            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
1869
1870            for info in fields:
1871
1872                info_id_sql = prefix + info
1873
1874                if (
1875                    info in fields_list
1876                    or prefix + info in fields_list
1877                    or info in extra_infos
1878                ):
1879
1880                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
1881
1882                    if info in header_infos:
1883                        info_type = header_infos[info].type
1884                        info_num = header_infos[info].num
1885                    else:
1886                        info_type = "String"
1887                        info_num = 0
1888
1889                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
1890                    if info_num != 1:
1891                        type_sql = "VARCHAR"
1892
1893                    # Add field
1894                    added_column = self.add_column(
1895                        table_name=table_variants,
1896                        column_name=info_id_sql,
1897                        column_type=type_sql,
1898                        default_value="null",
1899                        drop=force,
1900                    )
1901
1902                    if added_column:
1903                        added_columns.append(added_column)
1904
1905                    if added_column or force:
1906
1907                        # add field to index
1908                        self.index_additionnal_fields.append(info_id_sql)
1909
1910                        # Update field array
1911                        if connexion_format in ["duckdb"]:
1912                            update_info_field = f"""
1913                            "{info_id_sql}" =
1914                                CASE
1915                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
1916                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
1917                                END
1918                            """
1919                        elif connexion_format in ["sqlite"]:
1920                            update_info_field = f"""
1921                                "{info_id_sql}" =
1922                                    CASE
1923                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
1924                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
1925                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
1926                                    END
1927                            """
1928
1929                        sql_info_alter_table_array.append(update_info_field)
1930
1931            if sql_info_alter_table_array:
1932
1933                # By chromosomes
1934                try:
1935                    chromosomes_list = list(
1936                        self.get_query_to_df(
1937                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
1938                        )["#CHROM"]
1939                    )
1940                except:
1941                    chromosomes_list = [None]
1942
1943                for chrom in chromosomes_list:
1944                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
1945
1946                    # Where clause
1947                    where_clause = ""
1948                    if chrom and len(chromosomes_list) > 1:
1949                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
1950
1951                    # Update table
1952                    if proccess_all_fields_together:
1953                        sql_info_alter_table_array_join = ", ".join(
1954                            sql_info_alter_table_array
1955                        )
1956                        if sql_info_alter_table_array_join:
1957                            sql_info_alter_table = f"""
1958                                UPDATE {table_variants}
1959                                SET {sql_info_alter_table_array_join}
1960                                {where_clause}
1961                                """
1962                            log.debug(
1963                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
1964                            )
1965                            # log.debug(sql_info_alter_table)
1966                            self.conn.execute(sql_info_alter_table)
1967                    else:
1968                        sql_info_alter_num = 0
1969                        for sql_info_alter in sql_info_alter_table_array:
1970                            sql_info_alter_num += 1
1971                            sql_info_alter_table = f"""
1972                                UPDATE {table_variants}
1973                                SET {sql_info_alter}
1974                                {where_clause}
1975                                """
1976                            log.debug(
1977                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
1978                            )
1979                            # log.debug(sql_info_alter_table)
1980                            self.conn.execute(sql_info_alter_table)
1981
1982        # create indexes
1983        if create_index:
1984            self.create_indexes()
1985
1986        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to the `fields` parameter.
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated; if set to False, the existing column is left unchanged. Defaults to False.
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed in a single UPDATE statement. If set to False, each INFO field will be processed individually. Defaults to False.
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the table parameter, the function will use that table name; otherwise the default variants table is used.
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1988    def create_indexes(self) -> None:
1989        """
1990        Create indexes on the table after insertion
1991        """
1992
1993        # Access
1994        access = self.get_config().get("access", None)
1995
1996        # get table variants
1997        table_variants = self.get_table_variants("FROM")
1998
1999        if self.get_indexing() and access not in ["RO"]:
2000            # Create index
2001            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
2002            self.conn.execute(sql_create_table_index)
2003            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
2004            self.conn.execute(sql_create_table_index)
2005            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
2006            self.conn.execute(sql_create_table_index)
2007            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
2008            self.conn.execute(sql_create_table_index)
2009            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
2010            self.conn.execute(sql_create_table_index)
2011            for field in self.index_additionnal_fields:
2012                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
2013                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
2015    def drop_indexes(self) -> None:
2016        """
2017        Create indexes on the table after insertion
2018        """
2019
2020        # Access
2021        access = self.get_config().get("access", None)
2022
2023        # get table variants
2024        table_variants = self.get_table_variants("FROM")
2025
2026        # Get database format
2027        connexion_format = self.get_connexion_format()
2028
2029        if access not in ["RO"]:
2030            if connexion_format in ["duckdb"]:
2031                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2032            elif connexion_format in ["sqlite"]:
2033                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2034
2035            list_indexes = self.conn.execute(sql_list_indexes)
2036            index_names = [row[0] for row in list_indexes.fetchall()]
2037            for index in index_names:
2038                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2039                self.conn.execute(sql_drop_table_index)

Drop all indexes of the variants table.

def read_vcf_header(self, f) -> list:
2041    def read_vcf_header(self, f) -> list:
2042        """
2043        It reads the header of a VCF file and returns a list of the header lines
2044
2045        :param f: the file object
2046        :return: The header lines of the VCF file.
2047        """
2048
2049        header_list = []
2050        for line in f:
2051            header_list.append(line)
2052            if line.startswith("#CHROM"):
2053                break
2054        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
2056    def read_vcf_header_file(self, file: str = None) -> list:
2057        """
2058        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2059        uncompressed files.
2060
2061        :param file: The `file` parameter is a string that represents the path to the VCF header file
2062        that you want to read. It is an optional parameter, so if you don't provide a value, it will
2063        default to `None`
2064        :type file: str
2065        :return: The function `read_vcf_header_file` returns a list.
2066        """
2067
2068        if self.get_input_compressed(input_file=file):
2069            with bgzf.open(file, "rt") as f:
2070                return self.read_vcf_header(f=f)
2071        else:
2072            with open(file, "rt") as f:
2073                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
2075    def execute_query(self, query: str):
2076        """
2077        It takes a query as an argument, executes it, and returns the results
2078
2079        :param query: The query to be executed
2080        :return: The result of the query is being returned.
2081        """
2082        if query:
2083            return self.conn.execute(query)  # .fetchall()
2084        else:
2085            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None) -> bool:
2087    def export_output(
2088        self,
2089        output_file: str | None = None,
2090        output_header: str | None = None,
2091        export_header: bool = True,
2092        query: str | None = None,
2093        parquet_partitions: list | None = None,
2094        chunk_size: int | None = None,
2095        threads: int | None = None,
2096        sort: bool = False,
2097        index: bool = False,
2098        order_by: str | None = None,
2099    ) -> bool:
2100        """
2101        The `export_output` function exports data from a VCF file to a specified output file in various
2102        formats, including VCF, CSV, TSV, PSV, and Parquet.
2103
2104        :param output_file: The `output_file` parameter is a string that specifies the name of the
2105        output file to be generated by the function. This is where the exported data will be saved
2106        :type output_file: str
2107        :param output_header: The `output_header` parameter is a string that specifies the name of the
2108        file where the header of the VCF file will be exported. If this parameter is not provided, the
2109        header will be exported to a file with the same name as the `output_file` parameter, but with
2110        the extension "
2111        :type output_header: str
2112        :param export_header: The `export_header` parameter is a boolean flag that determines whether
2113        the header of a VCF file should be exported to a separate file or not. If `export_header` is
2114        True, the header will be exported to a file. If `export_header` is False, the header will not
2115        be, defaults to True, if output format is not VCF
2116        :type export_header: bool (optional)
2117        :param query: The `query` parameter is an optional SQL query that can be used to filter and
2118        select specific data from the VCF file before exporting it. If provided, only the data that
2119        matches the query will be exported
2120        :type query: str
2121        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
2122        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
2123        organize data in a hierarchical directory structure based on the values of one or more columns.
2124        This can improve query performance when working with large datasets
2125        :type parquet_partitions: list
2126        :param chunk_size: The `chunk_size` parameter specifies the number of
2127        records in batch when exporting data in Parquet format. This parameter is used for
2128        partitioning the Parquet file into multiple files.
2129        :type chunk_size: int
2130        :param threads: The `threads` parameter is an optional parameter that specifies the number of
2131        threads to be used during the export process. It determines the level of parallelism and can
2132        improve the performance of the export operation. If not provided, the function will use the
2133        default number of threads
2134        :type threads: int
2135        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
2136        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
2137        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
2138        False
2139        :type sort: bool (optional)
2140        :param index: The `index` parameter is a boolean flag that determines whether an index should be
2141        created on the output file. If `index` is True, an index will be created. If `index` is False,
2142        no index will be created. The default value is False, defaults to False
2143        :type index: bool (optional)
2144        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
2145        sorting the output file. This parameter is only applicable when exporting data in VCF format
2146        :type order_by: str
2147        :return: a boolean value. It checks if the output file exists and returns True if it does, or
2148        None if it doesn't.
2149        """
2150
2151        # Log
2152        log.info("Exporting...")
2153
2154        # Full path
2155        output_file = full_path(output_file)
2156        output_header = full_path(output_header)
2157
2158        # Config
2159        config = self.get_config()
2160
2161        # Param
2162        param = self.get_param()
2163
2164        # Tmp files to remove
2165        tmp_to_remove = []
2166
2167        # If no output, get it
2168        if not output_file:
2169            output_file = self.get_output()
2170
2171        # If not threads
2172        if not threads:
2173            threads = self.get_threads()
2174
2175        # Auto header name with extension
2176        if export_header or output_header:
2177            if not output_header:
2178                output_header = f"{output_file}.hdr"
2179            # Export header
2180            self.export_header(output_file=output_file)
2181
2182        # Switch off export header if VCF output
2183        output_file_type = get_file_format(output_file)
2184        if output_file_type in ["vcf"]:
2185            export_header = False
2186            tmp_to_remove.append(output_header)
2187
2188        # Chunk size
2189        if not chunk_size:
2190            chunk_size = config.get("chunk_size", None)
2191
2192        # Parquet partition
2193        if not parquet_partitions:
2194            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2195        if parquet_partitions and isinstance(parquet_partitions, str):
2196            parquet_partitions = parquet_partitions.split(",")
2197
2198        # Order by
2199        if not order_by:
2200            order_by = param.get("export", {}).get("order_by", "")
2201
2202        # Header in output
2203        header_in_output = param.get("export", {}).get("include_header", False)
2204
2205        # Database
2206        database_source = self.get_connexion()
2207
2208        # Connexion format
2209        connexion_format = self.get_connexion_format()
2210
2211        # Explode infos
2212        if self.get_explode_infos():
2213            self.explode_infos(
2214                prefix=self.get_explode_infos_prefix(),
2215                fields=self.get_explode_infos_fields(),
2216                force=False,
2217            )
2218
2219        # if connexion_format in ["sqlite"] or query:
2220        if connexion_format in ["sqlite"]:
2221
2222            # Export in Parquet
2223            random_tmp = "".join(
2224                random.choice(string.ascii_lowercase) for i in range(10)
2225            )
2226            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2227            tmp_to_remove.append(database_source)
2228
2229            # Table Variants
2230            table_variants = self.get_table_variants()
2231
2232            # Create export query
2233            sql_query_export_subquery = f"""
2234                SELECT * FROM {table_variants}
2235                """
2236
2237            # Write source file
2238            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2239
2240        # Create database
2241        database = Database(
2242            database=database_source,
2243            table="variants",
2244            header_file=output_header,
2245            conn_config=self.get_connexion_config(),
2246        )
2247
2248        # Existing colomns header
2249        existing_columns_header = database.get_header_columns_from_database(query=query)
2250
2251        # Sample list
2252        if output_file_type in ["vcf"]:
2253            get_samples = self.get_samples()
2254            get_samples_check = self.get_samples_check()
2255            samples_force = get_samples is not None
2256            sample_list = self.get_header_sample_list(
2257                check=get_samples_check,
2258                samples=get_samples,
2259                samples_force=samples_force,
2260            )
2261        else:
2262            sample_list = None
2263
2264        # Export file
2265        database.export(
2266            output_database=output_file,
2267            output_header=output_header,
2268            existing_columns_header=existing_columns_header,
2269            parquet_partitions=parquet_partitions,
2270            chunk_size=chunk_size,
2271            threads=threads,
2272            sort=sort,
2273            index=index,
2274            header_in_output=header_in_output,
2275            order_by=order_by,
2276            query=query,
2277            export_header=export_header,
2278            sample_list=sample_list,
2279        )
2280
2281        # Remove
2282        remove_if_exists(tmp_to_remove)
2283
2284        return (os.path.exists(output_file) or None) and (
2285            os.path.exists(output_file) or None
2286        )

The export_output function exports data from a VCF file to a specified output file in various formats, including VCF, CSV, TSV, PSV, and Parquet.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr".
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True, if output format is not VCF
  • query: The query parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
  • threads: The threads parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads
  • sort: The sort parameter is a boolean flag that determines whether the output file should be sorted or not. If sort is set to True, the output file will be sorted based on the genomic coordinates of the variants. By default, the value of sort is False, defaults to False
  • index: The index parameter is a boolean flag that determines whether an index should be created on the output file. If index is True, an index will be created. If index is False, no index will be created. The default value is False, defaults to False
  • order_by: The order_by parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns

a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2288    def get_extra_infos(self, table: str = None) -> list:
2289        """
2290        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2291        in the header.
2292
2293        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2294        name of the table from which you want to retrieve the extra columns that are not present in the
2295        header. If the `table` parameter is not provided when calling the function, it will default to
2296        using the variants
2297        :type table: str
2298        :return: A list of columns that are in the specified table but not in the header of the table.
2299        """
2300
2301        header_columns = []
2302
2303        if not table:
2304            table = self.get_table_variants(clause="from")
2305            header_columns = self.get_header_columns()
2306
2307        # Check all columns in the database
2308        query = f""" SELECT * FROM {table} LIMIT 1 """
2309        log.debug(f"query {query}")
2310        table_columns = self.get_query_to_df(query).columns.tolist()
2311        extra_columns = []
2312
2313        # Construct extra infos (not in header)
2314        for column in table_columns:
2315            if column not in header_columns:
2316                extra_columns.append(column)
2317
2318        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2320    def get_extra_infos_sql(self, table: str = None) -> str:
2321        """
2322        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2323        by double quotes
2324
2325        :param table: The name of the table to get the extra infos from. If None, the default table is
2326        used
2327        :type table: str
2328        :return: A string of the extra infos
2329        """
2330
2331        return ", ".join(
2332            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2333        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2335    def export_header(
2336        self,
2337        header_name: str = None,
2338        output_file: str = None,
2339        output_file_ext: str = ".hdr",
2340        clean_header: bool = True,
2341        remove_chrom_line: bool = False,
2342    ) -> str:
2343        """
2344        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2345        specified options, and writes it to a new file.
2346
2347        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2348        this parameter is not specified, the header will be written to the output file
2349        :type header_name: str
2350        :param output_file: The `output_file` parameter in the `export_header` function is used to
2351        specify the name of the output file where the header will be written. If this parameter is not
2352        provided, the header will be written to a temporary file
2353        :type output_file: str
2354        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2355        string that represents the extension of the output header file. By default, it is set to ".hdr"
2356        if not specified by the user. This extension will be appended to the `output_file` name to
2357        create the final, defaults to .hdr
2358        :type output_file_ext: str (optional)
2359        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2360        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2361        `True`, the function will clean the header by modifying certain lines based on a specific
2362        pattern. If `clean_header`, defaults to True
2363        :type clean_header: bool (optional)
2364        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2365        boolean flag that determines whether the #CHROM line should be removed from the header before
2366        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2367        defaults to False
2368        :type remove_chrom_line: bool (optional)
2369        :return: The function `export_header` returns the name of the temporary header file that is
2370        created.
2371        """
2372
2373        if not header_name and not output_file:
2374            output_file = self.get_output()
2375
2376        if self.get_header():
2377
2378            # Get header object
2379            header_obj = self.get_header()
2380
2381            # Create database
2382            db_for_header = Database(database=self.get_input())
2383
2384            # Get real columns in the file
2385            db_header_columns = db_for_header.get_columns()
2386
2387            with tempfile.TemporaryDirectory() as tmpdir:
2388
2389                # Write header file
2390                header_file_tmp = os.path.join(tmpdir, "header")
2391                f = open(header_file_tmp, "w")
2392                vcf.Writer(f, header_obj)
2393                f.close()
2394
2395                # Replace #CHROM line with rel columns
2396                header_list = db_for_header.read_header_file(
2397                    header_file=header_file_tmp
2398                )
2399                header_list[-1] = "\t".join(db_header_columns)
2400
2401                # Remove CHROM line
2402                if remove_chrom_line:
2403                    header_list.pop()
2404
2405                # Clean header
2406                if clean_header:
2407                    header_list_clean = []
2408                    for head in header_list:
2409                        # Clean head for malformed header
2410                        head_clean = head
2411                        head_clean = re.subn(
2412                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2413                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2414                            head_clean,
2415                            2,
2416                        )[0]
2417                        # Write header
2418                        header_list_clean.append(head_clean)
2419                    header_list = header_list_clean
2420
2421            tmp_header_name = output_file + output_file_ext
2422
2423            f = open(tmp_header_name, "w")
2424            for line in header_list:
2425                f.write(line)
2426            f.close()
2427
2428        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it will be kept. Defaults to False.
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2430    def export_variant_vcf(
2431        self,
2432        vcf_file,
2433        remove_info: bool = False,
2434        add_samples: bool = True,
2435        list_samples: list = [],
2436        where_clause: str = "",
2437        index: bool = False,
2438        threads: int | None = None,
2439    ) -> bool | None:
2440        """
2441        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2442        remove INFO field, add samples, and control compression and indexing.
2443
2444        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2445        written to. It is the output file that will contain the filtered VCF data based on the specified
2446        parameters
2447        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2448        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2449        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2450        in, defaults to False
2451        :type remove_info: bool (optional)
2452        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2453        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2454        If set to False, the samples will be removed. The default value is True, defaults to True
2455        :type add_samples: bool (optional)
2456        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2457        in the output VCF file. By default, all samples will be included. If you provide a list of
2458        samples, only those samples will be included in the output file
2459        :type list_samples: list
2460        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2461        determines whether or not to create an index for the output VCF file. If `index` is set to
2462        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2463        :type index: bool (optional)
2464        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2465        number of threads to use for exporting the VCF file. It determines how many parallel threads
2466        will be used during the export process. More threads can potentially speed up the export process
2467        by utilizing multiple cores of the processor. If
2468        :type threads: int | None
2469        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2470        method with various parameters including the output file, query, threads, sort flag, and index
2471        flag. The `export_output` method is responsible for exporting the VCF data based on the
2472        specified parameters and configurations provided in the `export_variant_vcf` function.
2473        """
2474
2475        # Config
2476        config = self.get_config()
2477
2478        # Extract VCF
2479        log.debug("Export VCF...")
2480
2481        # Table variants
2482        table_variants = self.get_table_variants()
2483
2484        # Threads
2485        if not threads:
2486            threads = self.get_threads()
2487
2488        # Info fields
2489        if remove_info:
2490            if not isinstance(remove_info, str):
2491                remove_info = "."
2492            info_field = f"""'{remove_info}' as INFO"""
2493        else:
2494            info_field = "INFO"
2495
2496        # Samples fields
2497        if add_samples:
2498            if not list_samples:
2499                list_samples = self.get_header_sample_list()
2500            if list_samples:
2501                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2502            else:
2503                samples_fields = ""
2504            log.debug(f"samples_fields: {samples_fields}")
2505        else:
2506            samples_fields = ""
2507
2508        # Where clause
2509        if where_clause is None:
2510            where_clause = ""
2511
2512        # Variants
2513        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2514        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2515        log.debug(f"sql_query_select={sql_query_select}")
2516
2517        return self.export_output(
2518            output_file=vcf_file,
2519            output_header=None,
2520            export_header=True,
2521            query=sql_query_select,
2522            parquet_partitions=None,
2523            chunk_size=config.get("chunk_size", None),
2524            threads=threads,
2525            sort=True,
2526            index=index,
2527            order_by=None,
2528        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2530    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2531        """
2532        It takes a list of commands and runs them in parallel using the number of threads specified
2533
2534        :param commands: A list of commands to run
2535        :param threads: The number of threads to use, defaults to 1 (optional)
2536        """
2537
2538        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2540    def get_threads(self, default: int = 1) -> int:
2541        """
2542        This function returns the number of threads to use for a job, with a default value of 1 if not
2543        specified.
2544
2545        :param default: The `default` parameter in the `get_threads` method is used to specify the
2546        default number of threads to use if no specific value is provided. If no value is provided for
2547        the `threads` parameter in the configuration or input parameters, the `default` value will be
2548        used, defaults to 1
2549        :type default: int (optional)
2550        :return: the number of threads to use for the current job.
2551        """
2552
2553        # Config
2554        config = self.get_config()
2555
2556        # Param
2557        param = self.get_param()
2558
2559        # Input threads
2560        input_thread = param.get("threads", config.get("threads", None))
2561
2562        # Check threads
2563        if not input_thread:
2564            threads = default
2565        elif int(input_thread) <= 0:
2566            threads = os.cpu_count()
2567        else:
2568            threads = int(input_thread)
2569        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2571    def get_memory(self, default: str = None) -> str:
2572        """
2573        This function retrieves the memory value from parameters or configuration with a default value
2574        if not found.
2575
2576        :param default: The `get_memory` function takes in a default value as a string parameter. This
2577        default value is used as a fallback in case the `memory` parameter is not provided in the
2578        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2579        the function
2580        :type default: str
2581        :return: The `get_memory` function returns a string value representing the memory parameter. If
2582        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2583        return the default value provided as an argument to the function.
2584        """
2585
2586        # Config
2587        config = self.get_config()
2588
2589        # Param
2590        param = self.get_param()
2591
2592        # Input threads
2593        input_memory = param.get("memory", config.get("memory", None))
2594
2595        # Check threads
2596        if input_memory:
2597            memory = input_memory
2598        else:
2599            memory = default
2600
2601        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2603    def update_from_vcf(self, vcf_file: str) -> None:
2604        """
2605        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2606
2607        :param vcf_file: the path to the VCF file
2608        """
2609
2610        connexion_format = self.get_connexion_format()
2611
2612        if connexion_format in ["duckdb"]:
2613            self.update_from_vcf_duckdb(vcf_file)
2614        elif connexion_format in ["sqlite"]:
2615            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
2617    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
2618        """
2619        It takes a VCF file and updates the INFO column of the variants table in the database with the
2620        INFO column of the VCF file
2621
2622        :param vcf_file: the path to the VCF file
2623        """
2624
2625        # varaints table
2626        table_variants = self.get_table_variants()
2627
2628        # Loading VCF into temporaire table
2629        skip = self.get_header_length(file=vcf_file)
2630        vcf_df = pd.read_csv(
2631            vcf_file,
2632            sep="\t",
2633            engine="c",
2634            skiprows=skip,
2635            header=0,
2636            low_memory=False,
2637        )
2638        sql_query_update = f"""
2639        UPDATE {table_variants} as table_variants
2640            SET INFO = concat(
2641                            CASE
2642                                WHEN INFO NOT IN ('', '.')
2643                                THEN INFO
2644                                ELSE ''
2645                            END,
2646                            (
2647                                SELECT 
2648                                    concat(
2649                                        CASE
2650                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
2651                                            THEN ';'
2652                                            ELSE ''
2653                                        END
2654                                        ,
2655                                        CASE
2656                                            WHEN table_parquet.INFO NOT IN ('','.')
2657                                            THEN table_parquet.INFO
2658                                            ELSE ''
2659                                        END
2660                                    )
2661                                FROM vcf_df as table_parquet
2662                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
2663                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
2664                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
2665                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
2666                                        AND table_parquet.INFO NOT IN ('','.')
2667                            )
2668                        )
2669            ;
2670            """
2671        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2673    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2674        """
2675        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2676        table, then updates the INFO column of the variants table with the INFO column of the temporary
2677        table
2678
2679        :param vcf_file: The path to the VCF file you want to update the database with
2680        """
2681
2682        # Create a temporary table for the VCF
2683        table_vcf = "tmp_vcf"
2684        sql_create = (
2685            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2686        )
2687        self.conn.execute(sql_create)
2688
2689        # Loading VCF into temporaire table
2690        vcf_df = pd.read_csv(
2691            vcf_file, sep="\t", comment="#", header=None, low_memory=False
2692        )
2693        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2694        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2695
2696        # Update table 'variants' with VCF data
2697        # warning: CONCAT as || operator
2698        sql_query_update = f"""
2699            UPDATE variants as table_variants
2700            SET INFO = CASE
2701                            WHEN INFO NOT IN ('', '.')
2702                            THEN INFO
2703                            ELSE ''
2704                        END ||
2705                        (
2706                        SELECT 
2707                            CASE 
2708                                WHEN table_variants.INFO NOT IN ('','.') 
2709                                    AND table_vcf.INFO NOT IN ('','.')  
2710                                THEN ';' 
2711                                ELSE '' 
2712                            END || 
2713                            CASE 
2714                                WHEN table_vcf.INFO NOT IN ('','.') 
2715                                THEN table_vcf.INFO 
2716                                ELSE '' 
2717                            END
2718                        FROM {table_vcf} as table_vcf
2719                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2720                            AND table_vcf.\"POS\" = table_variants.\"POS\"
2721                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2722                            AND table_vcf.\"REF\" = table_variants.\"REF\"
2723                        )
2724        """
2725        self.conn.execute(sql_query_update)
2726
2727        # Drop temporary table
2728        sql_drop = f"DROP TABLE {table_vcf}"
2729        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2731    def drop_variants_table(self) -> None:
2732        """
2733        > This function drops the variants table
2734        """
2735
2736        table_variants = self.get_table_variants()
2737        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2738        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2740    def set_variant_id(
2741        self, variant_id_column: str = "variant_id", force: bool = None
2742    ) -> str:
2743        """
2744        It adds a column to the variants table called `variant_id` and populates it with a hash of the
2745        `#CHROM`, `POS`, `REF`, and `ALT` columns
2746
2747        :param variant_id_column: The name of the column to be created in the variants table, defaults
2748        to variant_id
2749        :type variant_id_column: str (optional)
2750        :param force: If True, the variant_id column will be created even if it already exists
2751        :type force: bool
2752        :return: The name of the column that contains the variant_id
2753        """
2754
2755        # Assembly
2756        assembly = self.get_param().get(
2757            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
2758        )
2759
2760        # INFO/Tag prefix
2761        prefix = self.get_explode_infos_prefix()
2762
2763        # Explode INFO/SVTYPE
2764        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
2765
2766        # variants table
2767        table_variants = self.get_table_variants()
2768
2769        # variant_id column
2770        if not variant_id_column:
2771            variant_id_column = "variant_id"
2772
2773        # Creta variant_id column
2774        if "variant_id" not in self.get_extra_infos() or force:
2775
2776            # Create column
2777            self.add_column(
2778                table_name=table_variants,
2779                column_name=variant_id_column,
2780                column_type="UBIGINT",
2781                default_value="0",
2782            )
2783
2784            # Update column
2785            self.conn.execute(
2786                f"""
2787                    UPDATE {table_variants}
2788                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
2789                """
2790            )
2791
2792        # Remove added columns
2793        for added_column in added_columns:
2794            self.drop_column(column=added_column)
2795
2796        # return variant_id column name
2797        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2799    def get_variant_id_column(
2800        self, variant_id_column: str = "variant_id", force: bool = None
2801    ) -> str:
2802        """
2803        This function returns the variant_id column name
2804
2805        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2806        defaults to variant_id
2807        :type variant_id_column: str (optional)
2808        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2809        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2810        if it is not already set, or if it is set
2811        :type force: bool
2812        :return: The variant_id column name.
2813        """
2814
2815        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2821    def scan_databases(
2822        self,
2823        database_formats: list = ["parquet"],
2824        database_releases: list = ["current"],
2825    ) -> dict:
2826        """
2827        The function `scan_databases` scans for available databases based on specified formats and
2828        releases.
2829
2830        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2831        of the databases to be scanned. In this case, the accepted format is "parquet"
2832        :type database_formats: list ["parquet"]
2833        :param database_releases: The `database_releases` parameter is a list that specifies the
2834        releases of the databases to be scanned. In the provided function, the default value for
2835        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2836        databases that are in the "current"
2837        :type database_releases: list
2838        :return: The function `scan_databases` returns a dictionary containing information about
2839        databases that match the specified formats and releases.
2840        """
2841
2842        # Config
2843        config = self.get_config()
2844
2845        # Param
2846        param = self.get_param()
2847
2848        # Param - Assembly
2849        assembly = param.get("assembly", config.get("assembly", None))
2850        if not assembly:
2851            assembly = DEFAULT_ASSEMBLY
2852            log.warning(f"Default assembly '{assembly}'")
2853
2854        # Scan for availabled databases
2855        log.info(
2856            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2857        )
2858        databases_infos_dict = databases_infos(
2859            database_folder_releases=database_releases,
2860            database_formats=database_formats,
2861            assembly=assembly,
2862            config=config,
2863        )
2864        log.info(
2865            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2866        )
2867
2868        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases that are in the "current"
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
2870    def annotation(self) -> None:
2871        """
2872        It annotates the VCF file with the annotations specified in the config file.
2873        """
2874
2875        # Config
2876        config = self.get_config()
2877
2878        # Param
2879        param = self.get_param()
2880
2881        # Param - Assembly
2882        assembly = param.get("assembly", config.get("assembly", None))
2883        if not assembly:
2884            assembly = DEFAULT_ASSEMBLY
2885            log.warning(f"Default assembly '{assembly}'")
2886
2887        # annotations databases folders
2888        annotations_databases = set(
2889            config.get("folders", {})
2890            .get("databases", {})
2891            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2892            + config.get("folders", {})
2893            .get("databases", {})
2894            .get("parquet", ["~/howard/databases/parquet/current"])
2895            + config.get("folders", {})
2896            .get("databases", {})
2897            .get("bcftools", ["~/howard/databases/bcftools/current"])
2898        )
2899
2900        # Get param annotations
2901        if param.get("annotations", None) and isinstance(
2902            param.get("annotations", None), str
2903        ):
2904            log.debug(param.get("annotations", None))
2905            param_annotation_list = param.get("annotations").split(",")
2906        else:
2907            param_annotation_list = []
2908
2909        # Each tools param
2910        if param.get("annotation_parquet", None) != None:
2911            log.debug(
2912                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2913            )
2914            if isinstance(param.get("annotation_parquet", None), list):
2915                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2916            else:
2917                param_annotation_list.append(param.get("annotation_parquet"))
2918        if param.get("annotation_snpsift", None) != None:
2919            if isinstance(param.get("annotation_snpsift", None), list):
2920                param_annotation_list.append(
2921                    "snpsift:"
2922                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2923                )
2924            else:
2925                param_annotation_list.append(
2926                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2927                )
2928        if param.get("annotation_snpeff", None) != None:
2929            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2930        if param.get("annotation_bcftools", None) != None:
2931            if isinstance(param.get("annotation_bcftools", None), list):
2932                param_annotation_list.append(
2933                    "bcftools:"
2934                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2935                )
2936            else:
2937                param_annotation_list.append(
2938                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2939                )
2940        if param.get("annotation_annovar", None) != None:
2941            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2942        if param.get("annotation_exomiser", None) != None:
2943            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2944        if param.get("annotation_splice", None) != None:
2945            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2946
2947        # Merge param annotations list
2948        param["annotations"] = ",".join(param_annotation_list)
2949
2950        # debug
2951        log.debug(f"param_annotations={param['annotations']}")
2952
2953        if param.get("annotations"):
2954
2955            # Log
2956            # log.info("Annotations - Check annotation parameters")
2957
2958            if not "annotation" in param:
2959                param["annotation"] = {}
2960
2961            # List of annotations parameters
2962            annotations_list_input = {}
2963            if isinstance(param.get("annotations", None), str):
2964                annotation_file_list = [
2965                    value for value in param.get("annotations", "").split(",")
2966                ]
2967                for annotation_file in annotation_file_list:
2968                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
2969            else:
2970                annotations_list_input = param.get("annotations", {})
2971
2972            log.info(f"Quick Annotations:")
2973            for annotation_key in list(annotations_list_input.keys()):
2974                log.info(f"   {annotation_key}")
2975
2976            # List of annotations and associated fields
2977            annotations_list = {}
2978
2979            for annotation_file in annotations_list_input:
2980
2981                # Explode annotations if ALL
2982                if (
2983                    annotation_file.upper() == "ALL"
2984                    or annotation_file.upper().startswith("ALL:")
2985                ):
2986
2987                    # check ALL parameters (formats, releases)
2988                    annotation_file_split = annotation_file.split(":")
2989                    database_formats = "parquet"
2990                    database_releases = "current"
2991                    for annotation_file_option in annotation_file_split[1:]:
2992                        database_all_options_split = annotation_file_option.split("=")
2993                        if database_all_options_split[0] == "format":
2994                            database_formats = database_all_options_split[1].split("+")
2995                        if database_all_options_split[0] == "release":
2996                            database_releases = database_all_options_split[1].split("+")
2997
2998                    # Scan for availabled databases
2999                    databases_infos_dict = self.scan_databases(
3000                        database_formats=database_formats,
3001                        database_releases=database_releases,
3002                    )
3003
3004                    # Add found databases in annotation parameters
3005                    for database_infos in databases_infos_dict.keys():
3006                        annotations_list[database_infos] = {"INFO": None}
3007
3008                else:
3009                    annotations_list[annotation_file] = annotations_list_input[
3010                        annotation_file
3011                    ]
3012
3013            # Check each databases
3014            if len(annotations_list):
3015
3016                log.info(
3017                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
3018                )
3019
3020                for annotation_file in annotations_list:
3021
3022                    # Init
3023                    annotations = annotations_list.get(annotation_file, None)
3024
3025                    # Annotation snpEff
3026                    if annotation_file.startswith("snpeff"):
3027
3028                        log.debug(f"Quick Annotation snpEff")
3029
3030                        if "snpeff" not in param["annotation"]:
3031                            param["annotation"]["snpeff"] = {}
3032
3033                        if "options" not in param["annotation"]["snpeff"]:
3034                            param["annotation"]["snpeff"]["options"] = ""
3035
3036                        # snpEff options in annotations
3037                        param["annotation"]["snpeff"]["options"] = "".join(
3038                            annotation_file.split(":")[1:]
3039                        )
3040
3041                    # Annotation Annovar
3042                    elif annotation_file.startswith("annovar"):
3043
3044                        log.debug(f"Quick Annotation Annovar")
3045
3046                        if "annovar" not in param["annotation"]:
3047                            param["annotation"]["annovar"] = {}
3048
3049                        if "annotations" not in param["annotation"]["annovar"]:
3050                            param["annotation"]["annovar"]["annotations"] = {}
3051
3052                        # Options
3053                        annotation_file_split = annotation_file.split(":")
3054                        for annotation_file_annotation in annotation_file_split[1:]:
3055                            if annotation_file_annotation:
3056                                param["annotation"]["annovar"]["annotations"][
3057                                    annotation_file_annotation
3058                                ] = annotations
3059
3060                    # Annotation Exomiser
3061                    elif annotation_file.startswith("exomiser"):
3062
3063                        log.debug(f"Quick Annotation Exomiser")
3064
3065                        param["annotation"]["exomiser"] = params_string_to_dict(
3066                            annotation_file
3067                        )
3068
3069                    # Annotation Splice
3070                    elif annotation_file.startswith("splice"):
3071
3072                        log.debug(f"Quick Annotation Splice")
3073
3074                        param["annotation"]["splice"] = params_string_to_dict(
3075                            annotation_file
3076                        )
3077
3078                    # Annotation Parquet or BCFTOOLS
3079                    else:
3080
3081                        # Tools detection
3082                        if annotation_file.startswith("bcftools:"):
3083                            annotation_tool_initial = "bcftools"
3084                            annotation_file = ":".join(annotation_file.split(":")[1:])
3085                        elif annotation_file.startswith("snpsift:"):
3086                            annotation_tool_initial = "snpsift"
3087                            annotation_file = ":".join(annotation_file.split(":")[1:])
3088                        elif annotation_file.startswith("bigwig:"):
3089                            annotation_tool_initial = "bigwig"
3090                            annotation_file = ":".join(annotation_file.split(":")[1:])
3091                        else:
3092                            annotation_tool_initial = None
3093
3094                        # list of files
3095                        annotation_file_list = annotation_file.replace("+", ":").split(
3096                            ":"
3097                        )
3098
3099                        for annotation_file in annotation_file_list:
3100
3101                            if annotation_file:
3102
3103                                # Annotation tool initial
3104                                annotation_tool = annotation_tool_initial
3105
3106                                # Find file
3107                                annotation_file_found = None
3108
3109                                if os.path.exists(annotation_file):
3110                                    annotation_file_found = annotation_file
3111                                elif os.path.exists(full_path(annotation_file)):
3112                                    annotation_file_found = full_path(annotation_file)
3113                                else:
3114                                    # Find within assembly folders
3115                                    for annotations_database in annotations_databases:
3116                                        found_files = find_all(
3117                                            annotation_file,
3118                                            os.path.join(
3119                                                annotations_database, assembly
3120                                            ),
3121                                        )
3122                                        if len(found_files) > 0:
3123                                            annotation_file_found = found_files[0]
3124                                            break
3125                                    if not annotation_file_found and not assembly:
3126                                        # Find within folders
3127                                        for (
3128                                            annotations_database
3129                                        ) in annotations_databases:
3130                                            found_files = find_all(
3131                                                annotation_file, annotations_database
3132                                            )
3133                                            if len(found_files) > 0:
3134                                                annotation_file_found = found_files[0]
3135                                                break
3136                                log.debug(
3137                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
3138                                )
3139
3140                                # Full path
3141                                annotation_file_found = full_path(annotation_file_found)
3142
3143                                if annotation_file_found:
3144
3145                                    database = Database(database=annotation_file_found)
3146                                    quick_annotation_format = database.get_format()
3147                                    quick_annotation_is_compressed = (
3148                                        database.is_compressed()
3149                                    )
3150                                    quick_annotation_is_indexed = os.path.exists(
3151                                        f"{annotation_file_found}.tbi"
3152                                    )
3153                                    bcftools_preference = False
3154
3155                                    # Check Annotation Tool
3156                                    if not annotation_tool:
3157                                        if (
3158                                            bcftools_preference
3159                                            and quick_annotation_format
3160                                            in ["vcf", "bed"]
3161                                            and quick_annotation_is_compressed
3162                                            and quick_annotation_is_indexed
3163                                        ):
3164                                            annotation_tool = "bcftools"
3165                                        elif quick_annotation_format in [
3166                                            "vcf",
3167                                            "bed",
3168                                            "tsv",
3169                                            "tsv",
3170                                            "csv",
3171                                            "json",
3172                                            "tbl",
3173                                            "parquet",
3174                                            "duckdb",
3175                                        ]:
3176                                            annotation_tool = "parquet"
3177                                        elif quick_annotation_format in [
3178                                            "bw"
3179                                        ]:
3180                                            annotation_tool = "bigwig"
3181                                        else:
3182                                            log.error(
3183                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3184                                            )
3185                                            raise ValueError(
3186                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3187                                            )
3188
3189                                    log.debug(
3190                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3191                                    )
3192
3193                                    # Annotation Tool dispatch
3194                                    if annotation_tool:
3195                                        if annotation_tool not in param["annotation"]:
3196                                            param["annotation"][annotation_tool] = {}
3197                                        if (
3198                                            "annotations"
3199                                            not in param["annotation"][annotation_tool]
3200                                        ):
3201                                            param["annotation"][annotation_tool][
3202                                                "annotations"
3203                                            ] = {}
3204                                        param["annotation"][annotation_tool][
3205                                            "annotations"
3206                                        ][annotation_file_found] = annotations
3207
3208                                else:
3209                                    log.warning(
3210                                        f"Quick Annotation File {annotation_file} does NOT exist"
3211                                    )
3212
3213                self.set_param(param)
3214
3215        if param.get("annotation", None):
3216            log.info("Annotations")
3217            if param.get("annotation", {}).get("parquet", None):
3218                log.info("Annotations 'parquet'...")
3219                self.annotation_parquet()
3220            if param.get("annotation", {}).get("bcftools", None):
3221                log.info("Annotations 'bcftools'...")
3222                self.annotation_bcftools()
3223            if param.get("annotation", {}).get("snpsift", None):
3224                log.info("Annotations 'snpsift'...")
3225                self.annotation_snpsift()
3226            if param.get("annotation", {}).get("bigwig", None):
3227                log.info("Annotations 'bigwig'...")
3228                self.annotation_bigwig()
3229            if param.get("annotation", {}).get("annovar", None):
3230                log.info("Annotations 'annovar'...")
3231                self.annotation_annovar()
3232            if param.get("annotation", {}).get("snpeff", None):
3233                log.info("Annotations 'snpeff'...")
3234                self.annotation_snpeff()
3235            if param.get("annotation", {}).get("exomiser", None) is not None:
3236                log.info("Annotations 'exomiser'...")
3237                self.annotation_exomiser()
3238            if param.get("annotation", {}).get("splice", None) is not None:
3239                log.info("Annotations 'splice' ...")
3240                self.annotation_splice()
3241
3242        # Explode INFOS fields into table fields
3243        if self.get_explode_infos():
3244            self.explode_infos(
3245                prefix=self.get_explode_infos_prefix(),
3246                fields=self.get_explode_infos_fields(),
3247                force=True,
3248            )

It annotates the VCF file with the annotations specified in the config file.

def annotation_bigwig(self, threads: int = None) -> None:
3251    def annotation_bigwig(self, threads: int = None) -> None:
3252        """
3253        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
3254        
3255        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
3256        number of threads to be used for parallel processing during the annotation process. If the
3257        `threads` parameter is not provided, the method will attempt to determine the optimal number of
3258        threads to use based on the system configuration
3259        :type threads: int
3260        :return: True
3261        """
3262
3263        # DEBUG
3264        log.debug("Start annotation with bigwig databases")
3265
3266        # # Threads
3267        # if not threads:
3268        #     threads = self.get_threads()
3269        # log.debug("Threads: " + str(threads))
3270
3271        # Config
3272        config = self.get_config()
3273        log.debug("Config: " + str(config))
3274
3275        # Config - BCFTools databases folders
3276        databases_folders = set(
3277            self.get_config()
3278            .get("folders", {})
3279            .get("databases", {})
3280            .get("annotations", ["."])
3281            + self.get_config()
3282            .get("folders", {})
3283            .get("databases", {})
3284            .get("bigwig", ["."])
3285        )
3286        log.debug("Databases annotations: " + str(databases_folders))
3287
3288        # Param
3289        annotations = (
3290            self.get_param()
3291            .get("annotation", {})
3292            .get("bigwig", {})
3293            .get("annotations", None)
3294        )
3295        log.debug("Annotations: " + str(annotations))
3296
3297        # Assembly
3298        assembly = self.get_param().get(
3299            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3300        )
3301
3302        # Data
3303        table_variants = self.get_table_variants()
3304
3305        # Check if not empty
3306        log.debug("Check if not empty")
3307        sql_query_chromosomes = (
3308            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3309        )
3310        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3311        if not sql_query_chromosomes_df["count"][0]:
3312            log.info(f"VCF empty")
3313            return
3314
3315        # VCF header
3316        vcf_reader = self.get_header()
3317        log.debug("Initial header: " + str(vcf_reader.infos))
3318
3319        # Existing annotations
3320        for vcf_annotation in self.get_header().infos:
3321
3322            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3323            log.debug(
3324                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3325            )
3326
3327        if annotations:
3328
3329            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3330
3331                # Export VCF file
3332                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3333
3334                # annotation_bigwig_config
3335                annotation_bigwig_config_list = []
3336
3337                for annotation in annotations:
3338                    annotation_fields = annotations[annotation]
3339
3340                    # Annotation Name
3341                    annotation_name = os.path.basename(annotation)
3342
3343                    if not annotation_fields:
3344                        annotation_fields = {"INFO": None}
3345
3346                    log.debug(f"Annotation '{annotation_name}'")
3347                    log.debug(
3348                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3349                    )
3350
3351                    # Create Database
3352                    database = Database(
3353                        database=annotation,
3354                        databases_folders=databases_folders,
3355                        assembly=assembly,
3356                    )
3357
3358                    # Find files
3359                    db_file = database.get_database()
3360                    db_file = full_path(db_file)
3361                    db_hdr_file = database.get_header_file()
3362                    db_hdr_file = full_path(db_hdr_file)
3363                    db_file_type = database.get_format()
3364
3365                    # If db_file is http ?
3366                    if database.get_database().startswith("http"):
3367
3368                        # Datbase is HTTP URL
3369                        db_file_is_http = True
3370
3371                        # DB file keep as URL
3372                        db_file = database.get_database()
3373                        log.warning(f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)")
3374
3375                        # Retrieve automatic annotation field name
3376                        annotation_field = clean_annotation_field(os.path.basename(db_file).replace(".bw", ""))
3377                        log.debug(f"Create header file with annotation field '{annotation_field}' is an HTTP URL")
3378
3379                        # Create automatic header file
3380                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
3381                        with open(db_hdr_file, 'w') as f:
3382                            f.write("##fileformat=VCFv4.2\n")
3383                            f.write(f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""")
3384                            f.write(f"#CHROM	START	END	{annotation_field}\n")
3385
3386                    else:
3387
3388                        # Datbase is NOT HTTP URL
3389                        db_file_is_http = False
3390                    
3391
3392                    # Check index - try to create if not exists
3393                    if db_file is None or db_hdr_file is None or (not os.path.exists(db_file) and not db_file_is_http) or not os.path.exists(db_hdr_file) or not db_file_type in ["bw"]:
3394                    #if False:
3395                        log.error("Annotation failed: database not valid")
3396                        log.error(f"Annotation annotation file: {db_file}")
3397                        log.error(f"Annotation annotation file type: {db_file_type}")
3398                        log.error(f"Annotation annotation header: {db_hdr_file}")
3399                        raise ValueError(
3400                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
3401                        )
3402                    else:
3403
3404                        # Log
3405                        log.debug(
3406                            f"Annotation '{annotation}' - file: "
3407                            + str(db_file)
3408                            + " and "
3409                            + str(db_hdr_file)
3410                        )
3411
3412                        # Load header as VCF object
3413                        db_hdr_vcf = Variants(input=db_hdr_file)
3414                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3415                        log.debug(
3416                            "Annotation database header: "
3417                            + str(db_hdr_vcf_header_infos)
3418                        )
3419
3420                        # For all fields in database
3421                        annotation_fields_full = False
3422                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3423                            annotation_fields = {
3424                                key: key for key in db_hdr_vcf_header_infos
3425                            }
3426                            log.debug(
3427                                "Annotation database header - All annotations added: "
3428                                + str(annotation_fields)
3429                            )
3430                            annotation_fields_full = True
3431
3432                        # Init
3433                        cyvcf2_header_rename_dict = {}
3434                        cyvcf2_header_list = []
3435                        cyvcf2_header_indexes = {}
3436
3437                        # process annotation fields
3438                        for annotation_field in annotation_fields:
3439
3440                            # New annotation name 
3441                            annotation_field_new = annotation_fields[annotation_field]
3442
3443                            # Check annotation field and index in header
3444                            if annotation_field in db_hdr_vcf.get_header_columns_as_list():
3445                                annotation_field_index = db_hdr_vcf.get_header_columns_as_list().index(annotation_field)-3
3446                                cyvcf2_header_indexes[annotation_field_new] = annotation_field_index
3447                            else:
3448                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
3449                                log.error(msg_err)
3450                                raise ValueError(msg_err)
3451
3452                            # Append annotation field in cyvcf2 header list
3453                            cyvcf2_header_rename_dict[annotation_field_new] = db_hdr_vcf_header_infos[annotation_field].id
3454                            cyvcf2_header_list.append(
3455                                {
3456                                    "ID": annotation_field_new,
3457                                    "Number": db_hdr_vcf_header_infos[annotation_field].num,
3458                                    "Type": db_hdr_vcf_header_infos[annotation_field].type,
3459                                    "Description": db_hdr_vcf_header_infos[annotation_field].desc,
3460                                }
3461                            )
3462
3463                        # Load bigwig database
3464                        bw_db = pyBigWig.open(db_file)
3465                        if bw_db.isBigWig():
3466                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
3467                        else:
3468                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
3469                            log.error(msg_err)
3470                            raise ValueError(msg_err)
3471
3472                        annotation_bigwig_config_list.append(
3473                            {
3474                                "db_file": db_file,
3475                                "bw_db": bw_db,
3476                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
3477                                "cyvcf2_header_list": cyvcf2_header_list,
3478                                "cyvcf2_header_indexes": cyvcf2_header_indexes
3479                            }
3480                        )
3481
3482                # Annotate
3483                if annotation_bigwig_config_list:
3484
3485                    # Annotation config
3486                    log.debug(f"annotation_bigwig_config={annotation_bigwig_config_list}")
3487
3488                    # Export VCF file
3489                    self.export_variant_vcf(
3490                        vcf_file=tmp_vcf_name,
3491                        remove_info=True,
3492                        add_samples=False,
3493                        index=True,
3494                    )
3495
3496                    # Load input tmp file
3497                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
3498
3499                    # Add header in input file
3500                    for annotation_bigwig_config in annotation_bigwig_config_list:
3501                        for cyvcf2_header_field in annotation_bigwig_config.get("cyvcf2_header_list",[]):
3502                            log.info(f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'")
3503                            input_vcf.add_info_to_header(
3504                                cyvcf2_header_field
3505                            )
3506
3507                    # Create output VCF file
3508                    output_vcf_file = os.path.join(tmp_dir,"output.vcf.gz")
3509                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
3510
3511                    # Fetch variants
3512                    log.info(f"Annotations 'bigwig' start...")
3513                    for variant in input_vcf:
3514
3515                        for annotation_bigwig_config in annotation_bigwig_config_list:
3516
3517                            # DB and indexes
3518                            bw_db = annotation_bigwig_config.get("bw_db", None)
3519                            cyvcf2_header_indexes = annotation_bigwig_config.get("cyvcf2_header_indexes", None)
3520
3521                            # Retrieve value from chrom pos
3522                            res = bw_db.values(variant.CHROM, variant.POS - 1, variant.POS)
3523                            
3524                            # For each annotation fields (and indexes)
3525                            for cyvcf2_header_index in cyvcf2_header_indexes:
3526
3527                                # If value is NOT nNone
3528                                if not np.isnan(res[cyvcf2_header_indexes[cyvcf2_header_index]]):
3529                                    variant.INFO[cyvcf2_header_index] = res[cyvcf2_header_indexes[cyvcf2_header_index]]
3530
3531                        # Add record in output file
3532                        output_vcf.write_record(variant)
3533
3534                    # Log
3535                    log.debug(f"Annotation done.")
3536
3537                    # Close and write file
3538                    log.info(f"Annotations 'bigwig' write...")
3539                    output_vcf.close()
3540                    log.debug(f"Write done.")
3541
3542                    # Update variants
3543                    log.info(f"Annotations 'bigwig' update...")
3544                    self.update_from_vcf(output_vcf_file)
3545                    log.debug(f"Update done.")
3546
3547        return True

The function annotation_bigwig annotates variants in a VCF file using bigwig databases.

Parameters
  • threads: The threads parameter in the annotation_bigwig method is intended to specify the number of threads used for parallel processing during annotation; note that in the current implementation the thread-count handling is commented out, so this parameter is effectively unused
Returns

True

def annotation_snpsift(self, threads: int = None) -> None:
3550    def annotation_snpsift(self, threads: int = None) -> None:
3551        """
3552        This function annotate with bcftools
3553
3554        :param threads: Number of threads to use
3555        :return: the value of the variable "return_value".
3556        """
3557
3558        # DEBUG
3559        log.debug("Start annotation with bcftools databases")
3560
3561        # Threads
3562        if not threads:
3563            threads = self.get_threads()
3564        log.debug("Threads: " + str(threads))
3565
3566        # Config
3567        config = self.get_config()
3568        log.debug("Config: " + str(config))
3569
3570        # Config - snpSift
3571        snpsift_bin_command = get_bin_command(
3572            bin="SnpSift.jar",
3573            tool="snpsift",
3574            bin_type="jar",
3575            config=config,
3576            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3577        )
3578        if not snpsift_bin_command:
3579            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3580            log.error(msg_err)
3581            raise ValueError(msg_err)
3582
3583        # Config - bcftools
3584        bcftools_bin_command = get_bin_command(
3585            bin="bcftools",
3586            tool="bcftools",
3587            bin_type="bin",
3588            config=config,
3589            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3590        )
3591        if not bcftools_bin_command:
3592            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3593            log.error(msg_err)
3594            raise ValueError(msg_err)
3595
3596        # Config - BCFTools databases folders
3597        databases_folders = set(
3598            self.get_config()
3599            .get("folders", {})
3600            .get("databases", {})
3601            .get("annotations", ["."])
3602            + self.get_config()
3603            .get("folders", {})
3604            .get("databases", {})
3605            .get("bcftools", ["."])
3606        )
3607        log.debug("Databases annotations: " + str(databases_folders))
3608
3609        # Param
3610        annotations = (
3611            self.get_param()
3612            .get("annotation", {})
3613            .get("snpsift", {})
3614            .get("annotations", None)
3615        )
3616        log.debug("Annotations: " + str(annotations))
3617
3618        # Assembly
3619        assembly = self.get_param().get(
3620            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3621        )
3622
3623        # Data
3624        table_variants = self.get_table_variants()
3625
3626        # Check if not empty
3627        log.debug("Check if not empty")
3628        sql_query_chromosomes = (
3629            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3630        )
3631        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3632        if not sql_query_chromosomes_df["count"][0]:
3633            log.info(f"VCF empty")
3634            return
3635
3636        # VCF header
3637        vcf_reader = self.get_header()
3638        log.debug("Initial header: " + str(vcf_reader.infos))
3639
3640        # Existing annotations
3641        for vcf_annotation in self.get_header().infos:
3642
3643            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3644            log.debug(
3645                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3646            )
3647
3648        if annotations:
3649
3650            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3651
3652                # Export VCF file
3653                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3654
3655                # Init
3656                commands = {}
3657
3658                for annotation in annotations:
3659                    annotation_fields = annotations[annotation]
3660
3661                    # Annotation Name
3662                    annotation_name = os.path.basename(annotation)
3663
3664                    if not annotation_fields:
3665                        annotation_fields = {"INFO": None}
3666
3667                    log.debug(f"Annotation '{annotation_name}'")
3668                    log.debug(
3669                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3670                    )
3671
3672                    # Create Database
3673                    database = Database(
3674                        database=annotation,
3675                        databases_folders=databases_folders,
3676                        assembly=assembly,
3677                    )
3678
3679                    # Find files
3680                    db_file = database.get_database()
3681                    db_file = full_path(db_file)
3682                    db_hdr_file = database.get_header_file()
3683                    db_hdr_file = full_path(db_hdr_file)
3684                    db_file_type = database.get_format()
3685                    db_tbi_file = f"{db_file}.tbi"
3686                    db_file_compressed = database.is_compressed()
3687
3688                    # Check if compressed
3689                    if not db_file_compressed:
3690                        log.error(
3691                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3692                        )
3693                        raise ValueError(
3694                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3695                        )
3696
3697                    # Check if indexed
3698                    if not os.path.exists(db_tbi_file):
3699                        log.error(
3700                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3701                        )
3702                        raise ValueError(
3703                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3704                        )
3705
3706                    # Check index - try to create if not exists
3707                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3708                        log.error("Annotation failed: database not valid")
3709                        log.error(f"Annotation annotation file: {db_file}")
3710                        log.error(f"Annotation annotation header: {db_hdr_file}")
3711                        log.error(f"Annotation annotation index: {db_tbi_file}")
3712                        raise ValueError(
3713                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3714                        )
3715                    else:
3716
3717                        log.debug(
3718                            f"Annotation '{annotation}' - file: "
3719                            + str(db_file)
3720                            + " and "
3721                            + str(db_hdr_file)
3722                        )
3723
3724                        # Load header as VCF object
3725                        db_hdr_vcf = Variants(input=db_hdr_file)
3726                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3727                        log.debug(
3728                            "Annotation database header: "
3729                            + str(db_hdr_vcf_header_infos)
3730                        )
3731
3732                        # For all fields in database
3733                        annotation_fields_full = False
3734                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3735                            annotation_fields = {
3736                                key: key for key in db_hdr_vcf_header_infos
3737                            }
3738                            log.debug(
3739                                "Annotation database header - All annotations added: "
3740                                + str(annotation_fields)
3741                            )
3742                            annotation_fields_full = True
3743
3744                        # # Create file for field rename
3745                        # log.debug("Create file for field rename")
3746                        # tmp_rename = NamedTemporaryFile(
3747                        #     prefix=self.get_prefix(),
3748                        #     dir=self.get_tmp_dir(),
3749                        #     suffix=".rename",
3750                        #     delete=False,
3751                        # )
3752                        # tmp_rename_name = tmp_rename.name
3753                        # tmp_files.append(tmp_rename_name)
3754
3755                        # Number of fields
3756                        nb_annotation_field = 0
3757                        annotation_list = []
3758                        annotation_infos_rename_list = []
3759
3760                        for annotation_field in annotation_fields:
3761
3762                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3763                            annotation_fields_new_name = annotation_fields.get(
3764                                annotation_field, annotation_field
3765                            )
3766                            if not annotation_fields_new_name:
3767                                annotation_fields_new_name = annotation_field
3768
3769                            # Check if field is in DB and if field is not elready in input data
3770                            if (
3771                                annotation_field in db_hdr_vcf.get_header().infos
3772                                and annotation_fields_new_name
3773                                not in self.get_header().infos
3774                            ):
3775
3776                                log.info(
3777                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3778                                )
3779
3780                                # BCFTools annotate param to rename fields
3781                                if annotation_field != annotation_fields_new_name:
3782                                    annotation_infos_rename_list.append(
3783                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3784                                    )
3785
3786                                # Add INFO field to header
3787                                db_hdr_vcf_header_infos_number = (
3788                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3789                                )
3790                                db_hdr_vcf_header_infos_type = (
3791                                    db_hdr_vcf_header_infos[annotation_field].type
3792                                    or "String"
3793                                )
3794                                db_hdr_vcf_header_infos_description = (
3795                                    db_hdr_vcf_header_infos[annotation_field].desc
3796                                    or f"{annotation_field} description"
3797                                )
3798                                db_hdr_vcf_header_infos_source = (
3799                                    db_hdr_vcf_header_infos[annotation_field].source
3800                                    or "unknown"
3801                                )
3802                                db_hdr_vcf_header_infos_version = (
3803                                    db_hdr_vcf_header_infos[annotation_field].version
3804                                    or "unknown"
3805                                )
3806
3807                                vcf_reader.infos[annotation_fields_new_name] = (
3808                                    vcf.parser._Info(
3809                                        annotation_fields_new_name,
3810                                        db_hdr_vcf_header_infos_number,
3811                                        db_hdr_vcf_header_infos_type,
3812                                        db_hdr_vcf_header_infos_description,
3813                                        db_hdr_vcf_header_infos_source,
3814                                        db_hdr_vcf_header_infos_version,
3815                                        self.code_type_map[
3816                                            db_hdr_vcf_header_infos_type
3817                                        ],
3818                                    )
3819                                )
3820
3821                                annotation_list.append(annotation_field)
3822
3823                                nb_annotation_field += 1
3824
3825                            else:
3826
3827                                if (
3828                                    annotation_field
3829                                    not in db_hdr_vcf.get_header().infos
3830                                ):
3831                                    log.warning(
3832                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3833                                    )
3834                                if (
3835                                    annotation_fields_new_name
3836                                    in self.get_header().infos
3837                                ):
3838                                    log.warning(
3839                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3840                                    )
3841
3842                        log.info(
3843                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3844                        )
3845
3846                        annotation_infos = ",".join(annotation_list)
3847
3848                        if annotation_infos != "":
3849
3850                            # Annotated VCF (and error file)
3851                            tmp_annotation_vcf_name = os.path.join(
3852                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3853                            )
3854                            tmp_annotation_vcf_name_err = (
3855                                tmp_annotation_vcf_name + ".err"
3856                            )
3857
3858                            # Add fields to annotate
3859                            if not annotation_fields_full:
3860                                annotation_infos_option = f"-info {annotation_infos}"
3861                            else:
3862                                annotation_infos_option = ""
3863
3864                            # Info fields rename
3865                            if annotation_infos_rename_list:
3866                                annotation_infos_rename = " -c " + ",".join(
3867                                    annotation_infos_rename_list
3868                                )
3869                            else:
3870                                annotation_infos_rename = ""
3871
3872                            # Annotate command
3873                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3874
3875                            # Add command
3876                            commands[command_annotate] = tmp_annotation_vcf_name
3877
3878                if commands:
3879
3880                    # Export VCF file
3881                    self.export_variant_vcf(
3882                        vcf_file=tmp_vcf_name,
3883                        remove_info=True,
3884                        add_samples=False,
3885                        index=True,
3886                    )
3887                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3888
3889                    # Num command
3890                    nb_command = 0
3891
3892                    # Annotate
3893                    for command_annotate in commands:
3894                        nb_command += 1
3895                        log.info(
3896                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3897                        )
3898                        log.debug(f"command_annotate={command_annotate}")
3899                        run_parallel_commands([command_annotate], threads)
3900
3901                        # Debug
3902                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3903
3904                        # Update variants
3905                        log.info(
3906                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3907                        )
3908                        self.update_from_vcf(commands[command_annotate])

This function annotates variants with SnpSift (bcftools is used to rename INFO fields).

Parameters
  • threads: Number of threads to use
Returns

None.

def annotation_bcftools(self, threads: int = None) -> None:
3911    def annotation_bcftools(self, threads: int = None) -> None:
3912        """
3913        This function annotate with bcftools
3914
3915        :param threads: Number of threads to use
3916        :return: the value of the variable "return_value".
3917        """
3918
3919        # DEBUG
3920        log.debug("Start annotation with bcftools databases")
3921
3922        # Threads
3923        if not threads:
3924            threads = self.get_threads()
3925        log.debug("Threads: " + str(threads))
3926
3927        # Config
3928        config = self.get_config()
3929        log.debug("Config: " + str(config))
3930
3931        # DEBUG
3932        delete_tmp = True
3933        if self.get_config().get("verbosity", "warning") in ["debug"]:
3934            delete_tmp = False
3935            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3936
3937        # Config - BCFTools bin command
3938        bcftools_bin_command = get_bin_command(
3939            bin="bcftools",
3940            tool="bcftools",
3941            bin_type="bin",
3942            config=config,
3943            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3944        )
3945        if not bcftools_bin_command:
3946            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3947            log.error(msg_err)
3948            raise ValueError(msg_err)
3949
3950        # Config - BCFTools databases folders
3951        databases_folders = set(
3952            self.get_config()
3953            .get("folders", {})
3954            .get("databases", {})
3955            .get("annotations", ["."])
3956            + self.get_config()
3957            .get("folders", {})
3958            .get("databases", {})
3959            .get("bcftools", ["."])
3960        )
3961        log.debug("Databases annotations: " + str(databases_folders))
3962
3963        # Param
3964        annotations = (
3965            self.get_param()
3966            .get("annotation", {})
3967            .get("bcftools", {})
3968            .get("annotations", None)
3969        )
3970        log.debug("Annotations: " + str(annotations))
3971
3972        # Assembly
3973        assembly = self.get_param().get(
3974            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3975        )
3976
3977        # Data
3978        table_variants = self.get_table_variants()
3979
3980        # Check if not empty
3981        log.debug("Check if not empty")
3982        sql_query_chromosomes = (
3983            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3984        )
3985        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3986        if not sql_query_chromosomes_df["count"][0]:
3987            log.info(f"VCF empty")
3988            return
3989
3990        # Export in VCF
3991        log.debug("Create initial file to annotate")
3992        tmp_vcf = NamedTemporaryFile(
3993            prefix=self.get_prefix(),
3994            dir=self.get_tmp_dir(),
3995            suffix=".vcf.gz",
3996            delete=False,
3997        )
3998        tmp_vcf_name = tmp_vcf.name
3999
4000        # VCF header
4001        vcf_reader = self.get_header()
4002        log.debug("Initial header: " + str(vcf_reader.infos))
4003
4004        # Existing annotations
4005        for vcf_annotation in self.get_header().infos:
4006
4007            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4008            log.debug(
4009                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4010            )
4011
4012        if annotations:
4013
4014            tmp_ann_vcf_list = []
4015            commands = []
4016            tmp_files = []
4017            err_files = []
4018
4019            for annotation in annotations:
4020                annotation_fields = annotations[annotation]
4021
4022                # Annotation Name
4023                annotation_name = os.path.basename(annotation)
4024
4025                if not annotation_fields:
4026                    annotation_fields = {"INFO": None}
4027
4028                log.debug(f"Annotation '{annotation_name}'")
4029                log.debug(
4030                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
4031                )
4032
4033                # Create Database
4034                database = Database(
4035                    database=annotation,
4036                    databases_folders=databases_folders,
4037                    assembly=assembly,
4038                )
4039
4040                # Find files
4041                db_file = database.get_database()
4042                db_file = full_path(db_file)
4043                db_hdr_file = database.get_header_file()
4044                db_hdr_file = full_path(db_hdr_file)
4045                db_file_type = database.get_format()
4046                db_tbi_file = f"{db_file}.tbi"
4047                db_file_compressed = database.is_compressed()
4048
4049                # Check if compressed
4050                if not db_file_compressed:
4051                    log.error(
4052                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
4053                    )
4054                    raise ValueError(
4055                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
4056                    )
4057
4058                # Check if indexed
4059                if not os.path.exists(db_tbi_file):
4060                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
4061                    raise ValueError(
4062                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
4063                    )
4064
4065                # Check index - try to create if not exists
4066                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
4067                    log.error("Annotation failed: database not valid")
4068                    log.error(f"Annotation annotation file: {db_file}")
4069                    log.error(f"Annotation annotation header: {db_hdr_file}")
4070                    log.error(f"Annotation annotation index: {db_tbi_file}")
4071                    raise ValueError(
4072                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
4073                    )
4074                else:
4075
4076                    log.debug(
4077                        f"Annotation '{annotation}' - file: "
4078                        + str(db_file)
4079                        + " and "
4080                        + str(db_hdr_file)
4081                    )
4082
4083                    # Load header as VCF object
4084                    db_hdr_vcf = Variants(input=db_hdr_file)
4085                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
4086                    log.debug(
4087                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
4088                    )
4089
4090                    # For all fields in database
4091                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
4092                        annotation_fields = {
4093                            key: key for key in db_hdr_vcf_header_infos
4094                        }
4095                        log.debug(
4096                            "Annotation database header - All annotations added: "
4097                            + str(annotation_fields)
4098                        )
4099
4100                    # Number of fields
4101                    nb_annotation_field = 0
4102                    annotation_list = []
4103
4104                    for annotation_field in annotation_fields:
4105
4106                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
4107                        annotation_fields_new_name = annotation_fields.get(
4108                            annotation_field, annotation_field
4109                        )
4110                        if not annotation_fields_new_name:
4111                            annotation_fields_new_name = annotation_field
4112
4113                        # Check if field is in DB and if field is not elready in input data
4114                        if (
4115                            annotation_field in db_hdr_vcf.get_header().infos
4116                            and annotation_fields_new_name
4117                            not in self.get_header().infos
4118                        ):
4119
4120                            log.info(
4121                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
4122                            )
4123
4124                            # Add INFO field to header
4125                            db_hdr_vcf_header_infos_number = (
4126                                db_hdr_vcf_header_infos[annotation_field].num or "."
4127                            )
4128                            db_hdr_vcf_header_infos_type = (
4129                                db_hdr_vcf_header_infos[annotation_field].type
4130                                or "String"
4131                            )
4132                            db_hdr_vcf_header_infos_description = (
4133                                db_hdr_vcf_header_infos[annotation_field].desc
4134                                or f"{annotation_field} description"
4135                            )
4136                            db_hdr_vcf_header_infos_source = (
4137                                db_hdr_vcf_header_infos[annotation_field].source
4138                                or "unknown"
4139                            )
4140                            db_hdr_vcf_header_infos_version = (
4141                                db_hdr_vcf_header_infos[annotation_field].version
4142                                or "unknown"
4143                            )
4144
4145                            vcf_reader.infos[annotation_fields_new_name] = (
4146                                vcf.parser._Info(
4147                                    annotation_fields_new_name,
4148                                    db_hdr_vcf_header_infos_number,
4149                                    db_hdr_vcf_header_infos_type,
4150                                    db_hdr_vcf_header_infos_description,
4151                                    db_hdr_vcf_header_infos_source,
4152                                    db_hdr_vcf_header_infos_version,
4153                                    self.code_type_map[db_hdr_vcf_header_infos_type],
4154                                )
4155                            )
4156
4157                            # annotation_list.append(annotation_field)
4158                            if annotation_field != annotation_fields_new_name:
4159                                annotation_list.append(
4160                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
4161                                )
4162                            else:
4163                                annotation_list.append(annotation_field)
4164
4165                            nb_annotation_field += 1
4166
4167                        else:
4168
4169                            if annotation_field not in db_hdr_vcf.get_header().infos:
4170                                log.warning(
4171                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
4172                                )
4173                            if annotation_fields_new_name in self.get_header().infos:
4174                                log.warning(
4175                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
4176                                )
4177
4178                    log.info(
4179                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
4180                    )
4181
4182                    annotation_infos = ",".join(annotation_list)
4183
4184                    if annotation_infos != "":
4185
4186                        # Protect header for bcftools (remove "#CHROM" and variants line)
4187                        log.debug("Protect Header file - remove #CHROM line if exists")
4188                        tmp_header_vcf = NamedTemporaryFile(
4189                            prefix=self.get_prefix(),
4190                            dir=self.get_tmp_dir(),
4191                            suffix=".hdr",
4192                            delete=False,
4193                        )
4194                        tmp_header_vcf_name = tmp_header_vcf.name
4195                        tmp_files.append(tmp_header_vcf_name)
4196                        # Command
4197                        if db_hdr_file.endswith(".gz"):
4198                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
4199                        else:
4200                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
4201                        # Run
4202                        run_parallel_commands([command_extract_header], 1)
4203
4204                        # Find chomosomes
4205                        log.debug("Find chromosomes ")
4206                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
4207                        sql_query_chromosomes_df = self.get_query_to_df(
4208                            sql_query_chromosomes
4209                        )
4210                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
4211
4212                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
4213
4214                        # BED columns in the annotation file
4215                        if db_file_type in ["bed"]:
4216                            annotation_infos = "CHROM,POS,POS," + annotation_infos
4217
4218                        for chrom in chomosomes_list:
4219
4220                            # Create BED on initial VCF
4221                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
4222                            tmp_bed = NamedTemporaryFile(
4223                                prefix=self.get_prefix(),
4224                                dir=self.get_tmp_dir(),
4225                                suffix=".bed",
4226                                delete=False,
4227                            )
4228                            tmp_bed_name = tmp_bed.name
4229                            tmp_files.append(tmp_bed_name)
4230
4231                            # Detecte regions
4232                            log.debug(
4233                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
4234                            )
4235                            window = 1000000
4236                            sql_query_intervals_for_bed = f"""
4237                                SELECT  \"#CHROM\",
4238                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
4239                                        \"POS\"+{window}
4240                                FROM {table_variants} as table_variants
4241                                WHERE table_variants.\"#CHROM\" = '{chrom}'
4242                            """
4243                            regions = self.conn.execute(
4244                                sql_query_intervals_for_bed
4245                            ).fetchall()
4246                            merged_regions = merge_regions(regions)
4247                            log.debug(
4248                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
4249                            )
4250
4251                            header = ["#CHROM", "START", "END"]
4252                            with open(tmp_bed_name, "w") as f:
4253                                # Write the header with tab delimiter
4254                                f.write("\t".join(header) + "\n")
4255                                for d in merged_regions:
4256                                    # Write each data row with tab delimiter
4257                                    f.write("\t".join(map(str, d)) + "\n")
4258
4259                            # Tmp files
4260                            tmp_annotation_vcf = NamedTemporaryFile(
4261                                prefix=self.get_prefix(),
4262                                dir=self.get_tmp_dir(),
4263                                suffix=".vcf.gz",
4264                                delete=False,
4265                            )
4266                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
4267                            tmp_files.append(tmp_annotation_vcf_name)
4268                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
4269                            tmp_annotation_vcf_name_err = (
4270                                tmp_annotation_vcf_name + ".err"
4271                            )
4272                            err_files.append(tmp_annotation_vcf_name_err)
4273
4274                            # Annotate Command
4275                            log.debug(
4276                                f"Annotation '{annotation}' - add bcftools command"
4277                            )
4278
4279                            # Command
4280                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
4281
4282                            # Add command
4283                            commands.append(command_annotate)
4284
4285            # if some commands
4286            if commands:
4287
4288                # Export VCF file
4289                self.export_variant_vcf(
4290                    vcf_file=tmp_vcf_name,
4291                    remove_info=True,
4292                    add_samples=False,
4293                    index=True,
4294                )
4295
4296                # Threads
4297                # calculate threads for annotated commands
4298                if commands:
4299                    threads_bcftools_annotate = round(threads / len(commands))
4300                else:
4301                    threads_bcftools_annotate = 1
4302
4303                if not threads_bcftools_annotate:
4304                    threads_bcftools_annotate = 1
4305
4306                # Add threads option to bcftools commands
4307                if threads_bcftools_annotate > 1:
4308                    commands_threaded = []
4309                    for command in commands:
4310                        commands_threaded.append(
4311                            command.replace(
4312                                f"{bcftools_bin_command} annotate ",
4313                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
4314                            )
4315                        )
4316                    commands = commands_threaded
4317
4318                # Command annotation multithreading
4319                log.debug(f"Annotation - Annotation commands: " + str(commands))
4320                log.info(
4321                    f"Annotation - Annotation multithreaded in "
4322                    + str(len(commands))
4323                    + " commands"
4324                )
4325
4326                run_parallel_commands(commands, threads)
4327
4328                # Merge
4329                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
4330
4331                if tmp_ann_vcf_list_cmd:
4332
4333                    # Tmp file
4334                    tmp_annotate_vcf = NamedTemporaryFile(
4335                        prefix=self.get_prefix(),
4336                        dir=self.get_tmp_dir(),
4337                        suffix=".vcf.gz",
4338                        delete=True,
4339                    )
4340                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
4341                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4342                    err_files.append(tmp_annotate_vcf_name_err)
4343
4344                    # Tmp file remove command
4345                    tmp_files_remove_command = ""
4346                    if tmp_files:
4347                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
4348
4349                    # Command merge
4350                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
4351                    log.info(
4352                        f"Annotation - Annotation merging "
4353                        + str(len(commands))
4354                        + " annotated files"
4355                    )
4356                    log.debug(f"Annotation - merge command: {merge_command}")
4357                    run_parallel_commands([merge_command], 1)
4358
4359                    # Error messages
4360                    log.info(f"Error/Warning messages:")
4361                    error_message_command_all = []
4362                    error_message_command_warning = []
4363                    error_message_command_err = []
4364                    for err_file in err_files:
4365                        with open(err_file, "r") as f:
4366                            for line in f:
4367                                message = line.strip()
4368                                error_message_command_all.append(message)
4369                                if line.startswith("[W::"):
4370                                    error_message_command_warning.append(message)
4371                                if line.startswith("[E::"):
4372                                    error_message_command_err.append(
4373                                        f"{err_file}: " + message
4374                                    )
4375                    # log info
4376                    for message in list(
4377                        set(error_message_command_err + error_message_command_warning)
4378                    ):
4379                        log.info(f"   {message}")
4380                    # debug info
4381                    for message in list(set(error_message_command_all)):
4382                        log.debug(f"   {message}")
4383                    # failed
4384                    if len(error_message_command_err):
4385                        log.error("Annotation failed: Error in commands")
4386                        raise ValueError("Annotation failed: Error in commands")
4387
4388                    # Update variants
4389                    log.info(f"Annotation - Updating...")
4390                    self.update_from_vcf(tmp_annotate_vcf_name)

    # This function annotates with bcftools.
    #
    # Parameters:
    #   - threads: number of threads to use
    #
    # Returns:
    #   the value of the variable "return_value"
4392    def annotation_exomiser(self, threads: int = None) -> None:
4393        """
4394        This function annotate with Exomiser
4395
4396        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
4397        - "analysis" (dict/file):
4398            Full analysis dictionnary parameters (see Exomiser docs).
4399            Either a dict, or a file in JSON or YAML format.
4400            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
4401            Default : None
4402        - "preset" (string):
4403            Analysis preset (available in config folder).
4404            Used if no full "analysis" is provided.
4405            Default: "exome"
4406        - "phenopacket" (dict/file):
4407            Samples and phenotipic features parameters (see Exomiser docs).
4408            Either a dict, or a file in JSON or YAML format.
4409            Default: None
4410        - "subject" (dict):
4411            Sample parameters (see Exomiser docs).
4412            Example:
4413                "subject":
4414                    {
4415                        "id": "ISDBM322017",
4416                        "sex": "FEMALE"
4417                    }
4418            Default: None
4419        - "sample" (string):
4420            Sample name to construct "subject" section:
4421                "subject":
4422                    {
4423                        "id": "<sample>",
4424                        "sex": "UNKNOWN_SEX"
4425                    }
4426            Default: None
4427        - "phenotypicFeatures" (dict)
4428            Phenotypic features to construct "subject" section.
4429            Example:
4430                "phenotypicFeatures":
4431                    [
4432                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
4433                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
4434                    ]
4435        - "hpo" (list)
4436            List of HPO ids as phenotypic features.
4437            Example:
4438                "hpo": ['0001156', '0001363', '0011304', '0010055']
4439            Default: []
4440        - "outputOptions" (dict):
4441            Output options (see Exomiser docs).
4442            Default:
4443                "output_options" =
4444                    {
4445                        "outputContributingVariantsOnly": False,
4446                        "numGenes": 0,
4447                        "outputFormats": ["TSV_VARIANT", "VCF"]
4448                    }
4449        - "transcript_source" (string):
4450            Transcript source (either "refseq", "ucsc", "ensembl")
4451            Default: "refseq"
4452        - "exomiser_to_info" (boolean):
4453            Add exomiser TSV file columns as INFO fields in VCF.
4454            Default: False
4455        - "release" (string):
4456            Exomise database release.
4457            If not exists, database release will be downloaded (take a while).
4458            Default: None (provided by application.properties configuration file)
4459        - "exomiser_application_properties" (file):
4460            Exomiser configuration file (see Exomiser docs).
4461            Useful to automatically download databases (especially for specific genome databases).
4462
4463        Notes:
4464        - If no sample in parameters, first sample in VCF will be chosen
4465        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4466
4467        :param threads: The number of threads to use
4468        :return: None.
4469        """
4470
4471        # DEBUG
4472        log.debug("Start annotation with Exomiser databases")
4473
4474        # Threads
4475        if not threads:
4476            threads = self.get_threads()
4477        log.debug("Threads: " + str(threads))
4478
4479        # Config
4480        config = self.get_config()
4481        log.debug("Config: " + str(config))
4482
4483        # Config - Folders - Databases
4484        databases_folders = (
4485            config.get("folders", {})
4486            .get("databases", {})
4487            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4488        )
4489        databases_folders = full_path(databases_folders)
4490        if not os.path.exists(databases_folders):
4491            log.error(f"Databases annotations: {databases_folders} NOT found")
4492        log.debug("Databases annotations: " + str(databases_folders))
4493
4494        # Config - Exomiser
4495        exomiser_bin_command = get_bin_command(
4496            bin="exomiser-cli*.jar",
4497            tool="exomiser",
4498            bin_type="jar",
4499            config=config,
4500            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4501        )
4502        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4503        if not exomiser_bin_command:
4504            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4505            log.error(msg_err)
4506            raise ValueError(msg_err)
4507
4508        # Param
4509        param = self.get_param()
4510        log.debug("Param: " + str(param))
4511
4512        # Param - Exomiser
4513        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4514        log.debug(f"Param Exomiser: {param_exomiser}")
4515
4516        # Param - Assembly
4517        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4518        log.debug("Assembly: " + str(assembly))
4519
4520        # Data
4521        table_variants = self.get_table_variants()
4522
4523        # Check if not empty
4524        log.debug("Check if not empty")
4525        sql_query_chromosomes = (
4526            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4527        )
4528        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4529            log.info(f"VCF empty")
4530            return False
4531
4532        # VCF header
4533        vcf_reader = self.get_header()
4534        log.debug("Initial header: " + str(vcf_reader.infos))
4535
4536        # Samples
4537        samples = self.get_header_sample_list()
4538        if not samples:
4539            log.error("No Samples in VCF")
4540            return False
4541        log.debug(f"Samples: {samples}")
4542
4543        # Memory limit
4544        memory_limit = self.get_memory("8G")
4545        log.debug(f"memory_limit: {memory_limit}")
4546
4547        # Exomiser java options
4548        exomiser_java_options = (
4549            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4550        )
4551        log.debug(f"Exomiser java options: {exomiser_java_options}")
4552
4553        # Download Exomiser (if not exists)
4554        exomiser_release = param_exomiser.get("release", None)
4555        exomiser_application_properties = param_exomiser.get(
4556            "exomiser_application_properties", None
4557        )
4558        databases_download_exomiser(
4559            assemblies=[assembly],
4560            exomiser_folder=databases_folders,
4561            exomiser_release=exomiser_release,
4562            exomiser_phenotype_release=exomiser_release,
4563            exomiser_application_properties=exomiser_application_properties,
4564        )
4565
4566        # Force annotation
4567        force_update_annotation = True
4568
4569        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4570            log.debug("Start annotation Exomiser")
4571
4572            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4573
4574                # tmp_dir = "/tmp/exomiser"
4575
4576                ### ANALYSIS ###
4577                ################
4578
4579                # Create analysis.json through analysis dict
4580                # either analysis in param or by default
4581                # depending on preset exome/genome)
4582
4583                # Init analysis dict
4584                param_exomiser_analysis_dict = {}
4585
4586                # analysis from param
4587                param_exomiser_analysis = param_exomiser.get("analysis", {})
4588                param_exomiser_analysis = full_path(param_exomiser_analysis)
4589
4590                # If analysis in param -> load anlaysis json
4591                if param_exomiser_analysis:
4592
4593                    # If param analysis is a file and exists
4594                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4595                        param_exomiser_analysis
4596                    ):
4597                        # Load analysis file into analysis dict (either yaml or json)
4598                        with open(param_exomiser_analysis) as json_file:
4599                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4600
4601                    # If param analysis is a dict
4602                    elif isinstance(param_exomiser_analysis, dict):
4603                        # Load analysis dict into analysis dict (either yaml or json)
4604                        param_exomiser_analysis_dict = param_exomiser_analysis
4605
4606                    # Error analysis type
4607                    else:
4608                        log.error(f"Analysis type unknown. Check param file.")
4609                        raise ValueError(f"Analysis type unknown. Check param file.")
4610
4611                # Case no input analysis config file/dict
4612                # Use preset (exome/genome) to open default config file
4613                if not param_exomiser_analysis_dict:
4614
4615                    # default preset
4616                    default_preset = "exome"
4617
4618                    # Get param preset or default preset
4619                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4620
4621                    # Try to find if preset is a file
4622                    if os.path.exists(param_exomiser_preset):
4623                        # Preset file is provided in full path
4624                        param_exomiser_analysis_default_config_file = (
4625                            param_exomiser_preset
4626                        )
4627                    # elif os.path.exists(full_path(param_exomiser_preset)):
4628                    #     # Preset file is provided in full path
4629                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4630                    elif os.path.exists(
4631                        os.path.join(folder_config, param_exomiser_preset)
4632                    ):
4633                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4634                        param_exomiser_analysis_default_config_file = os.path.join(
4635                            folder_config, param_exomiser_preset
4636                        )
4637                    else:
4638                        # Construct preset file
4639                        param_exomiser_analysis_default_config_file = os.path.join(
4640                            folder_config,
4641                            f"preset-{param_exomiser_preset}-analysis.json",
4642                        )
4643
4644                    # If preset file exists
4645                    param_exomiser_analysis_default_config_file = full_path(
4646                        param_exomiser_analysis_default_config_file
4647                    )
4648                    if os.path.exists(param_exomiser_analysis_default_config_file):
4649                        # Load prest file into analysis dict (either yaml or json)
4650                        with open(
4651                            param_exomiser_analysis_default_config_file
4652                        ) as json_file:
4653                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4654                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4655                                json_file
4656                            )
4657
4658                    # Error preset file
4659                    else:
4660                        log.error(
4661                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4662                        )
4663                        raise ValueError(
4664                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4665                        )
4666
4667                # If no analysis dict created
4668                if not param_exomiser_analysis_dict:
4669                    log.error(f"No analysis config")
4670                    raise ValueError(f"No analysis config")
4671
4672                # Log
4673                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4674
4675                ### PHENOPACKET ###
4676                ###################
4677
4678                # If no PhenoPacket in analysis dict -> check in param
4679                if "phenopacket" not in param_exomiser_analysis_dict:
4680
4681                    # If PhenoPacket in param -> load anlaysis json
4682                    if param_exomiser.get("phenopacket", None):
4683
4684                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4685                        param_exomiser_phenopacket = full_path(
4686                            param_exomiser_phenopacket
4687                        )
4688
4689                        # If param phenopacket is a file and exists
4690                        if isinstance(
4691                            param_exomiser_phenopacket, str
4692                        ) and os.path.exists(param_exomiser_phenopacket):
4693                            # Load phenopacket file into analysis dict (either yaml or json)
4694                            with open(param_exomiser_phenopacket) as json_file:
4695                                param_exomiser_analysis_dict["phenopacket"] = (
4696                                    yaml.safe_load(json_file)
4697                                )
4698
4699                        # If param phenopacket is a dict
4700                        elif isinstance(param_exomiser_phenopacket, dict):
4701                            # Load phenopacket dict into analysis dict (either yaml or json)
4702                            param_exomiser_analysis_dict["phenopacket"] = (
4703                                param_exomiser_phenopacket
4704                            )
4705
4706                        # Error phenopacket type
4707                        else:
4708                            log.error(f"Phenopacket type unknown. Check param file.")
4709                            raise ValueError(
4710                                f"Phenopacket type unknown. Check param file."
4711                            )
4712
4713                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4714                if "phenopacket" not in param_exomiser_analysis_dict:
4715
4716                    # Init PhenoPacket
4717                    param_exomiser_analysis_dict["phenopacket"] = {
4718                        "id": "analysis",
4719                        "proband": {},
4720                    }
4721
4722                    ### Add subject ###
4723
4724                    # If subject exists
4725                    param_exomiser_subject = param_exomiser.get("subject", {})
4726
4727                    # If subject not exists -> found sample ID
4728                    if not param_exomiser_subject:
4729
4730                        # Found sample ID in param
4731                        sample = param_exomiser.get("sample", None)
4732
4733                        # Find sample ID (first sample)
4734                        if not sample:
4735                            sample_list = self.get_header_sample_list()
4736                            if len(sample_list) > 0:
4737                                sample = sample_list[0]
4738                            else:
4739                                log.error(f"No sample found")
4740                                raise ValueError(f"No sample found")
4741
4742                        # Create subject
4743                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4744
4745                    # Add to dict
4746                    param_exomiser_analysis_dict["phenopacket"][
4747                        "subject"
4748                    ] = param_exomiser_subject
4749
4750                    ### Add "phenotypicFeatures" ###
4751
4752                    # If phenotypicFeatures exists
4753                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4754                        "phenotypicFeatures", []
4755                    )
4756
4757                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4758                    if not param_exomiser_phenotypicfeatures:
4759
4760                        # Found HPO in param
4761                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4762
4763                        # Split HPO if list in string format separated by comma
4764                        if isinstance(param_exomiser_hpo, str):
4765                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4766
4767                        # Create HPO list
4768                        for hpo in param_exomiser_hpo:
4769                            hpo_clean = re.sub("[^0-9]", "", hpo)
4770                            param_exomiser_phenotypicfeatures.append(
4771                                {
4772                                    "type": {
4773                                        "id": f"HP:{hpo_clean}",
4774                                        "label": f"HP:{hpo_clean}",
4775                                    }
4776                                }
4777                            )
4778
4779                    # Add to dict
4780                    param_exomiser_analysis_dict["phenopacket"][
4781                        "phenotypicFeatures"
4782                    ] = param_exomiser_phenotypicfeatures
4783
4784                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4785                    if not param_exomiser_phenotypicfeatures:
4786                        for step in param_exomiser_analysis_dict.get(
4787                            "analysis", {}
4788                        ).get("steps", []):
4789                            if "hiPhivePrioritiser" in step:
4790                                param_exomiser_analysis_dict.get("analysis", {}).get(
4791                                    "steps", []
4792                                ).remove(step)
4793
4794                ### Add Input File ###
4795
4796                # Initial file name and htsFiles
4797                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4798                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4799                    {
4800                        "uri": tmp_vcf_name,
4801                        "htsFormat": "VCF",
4802                        "genomeAssembly": assembly,
4803                    }
4804                ]
4805
4806                ### Add metaData ###
4807
4808                # If metaData not in analysis dict
4809                if "metaData" not in param_exomiser_analysis_dict:
4810                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4811                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4812                        "createdBy": "howard",
4813                        "phenopacketSchemaVersion": 1,
4814                    }
4815
4816                ### OutputOptions ###
4817
4818                # Init output result folder
4819                output_results = os.path.join(tmp_dir, "results")
4820
4821                # If no outputOptions in analysis dict
4822                if "outputOptions" not in param_exomiser_analysis_dict:
4823
4824                    # default output formats
4825                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4826
4827                    # Get outputOptions in param
4828                    output_options = param_exomiser.get("outputOptions", None)
4829
4830                    # If no output_options in param -> check
4831                    if not output_options:
4832                        output_options = {
4833                            "outputContributingVariantsOnly": False,
4834                            "numGenes": 0,
4835                            "outputFormats": defaut_output_formats,
4836                        }
4837
4838                    # Replace outputDirectory in output options
4839                    output_options["outputDirectory"] = output_results
4840                    output_options["outputFileName"] = "howard"
4841
4842                    # Add outputOptions in analysis dict
4843                    param_exomiser_analysis_dict["outputOptions"] = output_options
4844
4845                else:
4846
4847                    # Replace output_results and output format (if exists in param)
4848                    param_exomiser_analysis_dict["outputOptions"][
4849                        "outputDirectory"
4850                    ] = output_results
4851                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4852                        list(
4853                            set(
4854                                param_exomiser_analysis_dict.get(
4855                                    "outputOptions", {}
4856                                ).get("outputFormats", [])
4857                                + ["TSV_VARIANT", "VCF"]
4858                            )
4859                        )
4860                    )
4861
4862                # log
4863                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4864
4865                ### ANALYSIS FILE ###
4866                #####################
4867
4868                ### Full JSON analysis config file ###
4869
4870                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4871                with open(exomiser_analysis, "w") as fp:
4872                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4873
4874                ### SPLIT analysis and sample config files
4875
4876                # Splitted analysis dict
4877                param_exomiser_analysis_dict_for_split = (
4878                    param_exomiser_analysis_dict.copy()
4879                )
4880
4881                # Phenopacket JSON file
4882                exomiser_analysis_phenopacket = os.path.join(
4883                    tmp_dir, "analysis_phenopacket.json"
4884                )
4885                with open(exomiser_analysis_phenopacket, "w") as fp:
4886                    json.dump(
4887                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4888                        fp,
4889                        indent=4,
4890                    )
4891
4892                # Analysis JSON file without Phenopacket parameters
4893                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4894                exomiser_analysis_analysis = os.path.join(
4895                    tmp_dir, "analysis_analysis.json"
4896                )
4897                with open(exomiser_analysis_analysis, "w") as fp:
4898                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4899
4900                ### INITAL VCF file ###
4901                #######################
4902
4903                ### Create list of samples to use and include inti initial VCF file ####
4904
4905                # Subject (main sample)
4906                # Get sample ID in analysis dict
4907                sample_subject = (
4908                    param_exomiser_analysis_dict.get("phenopacket", {})
4909                    .get("subject", {})
4910                    .get("id", None)
4911                )
4912                sample_proband = (
4913                    param_exomiser_analysis_dict.get("phenopacket", {})
4914                    .get("proband", {})
4915                    .get("subject", {})
4916                    .get("id", None)
4917                )
4918                sample = []
4919                if sample_subject:
4920                    sample.append(sample_subject)
4921                if sample_proband:
4922                    sample.append(sample_proband)
4923
4924                # Get sample ID within Pedigree
4925                pedigree_persons_list = (
4926                    param_exomiser_analysis_dict.get("phenopacket", {})
4927                    .get("pedigree", {})
4928                    .get("persons", {})
4929                )
4930
4931                # Create list with all sample ID in pedigree (if exists)
4932                pedigree_persons = []
4933                for person in pedigree_persons_list:
4934                    pedigree_persons.append(person.get("individualId"))
4935
4936                # Concat subject sample ID and samples ID in pedigreesamples
4937                samples = list(set(sample + pedigree_persons))
4938
4939                # Check if sample list is not empty
4940                if not samples:
4941                    log.error(f"No samples found")
4942                    raise ValueError(f"No samples found")
4943
4944                # Create VCF with sample (either sample in param or first one by default)
4945                # Export VCF file
4946                self.export_variant_vcf(
4947                    vcf_file=tmp_vcf_name,
4948                    remove_info=True,
4949                    add_samples=True,
4950                    list_samples=samples,
4951                    index=False,
4952                )
4953
4954                ### Execute Exomiser ###
4955                ########################
4956
4957                # Init command
4958                exomiser_command = ""
4959
4960                # Command exomiser options
4961                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4962
4963                # Release
4964                exomiser_release = param_exomiser.get("release", None)
4965                if exomiser_release:
4966                    # phenotype data version
4967                    exomiser_options += (
4968                        f" --exomiser.phenotype.data-version={exomiser_release} "
4969                    )
4970                    # data version
4971                    exomiser_options += (
4972                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4973                    )
4974                    # variant white list
4975                    variant_white_list_file = (
4976                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4977                    )
4978                    if os.path.exists(
4979                        os.path.join(
4980                            databases_folders, assembly, variant_white_list_file
4981                        )
4982                    ):
4983                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4984
4985                # transcript_source
4986                transcript_source = param_exomiser.get(
4987                    "transcript_source", None
4988                )  # ucsc, refseq, ensembl
4989                if transcript_source:
4990                    exomiser_options += (
4991                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4992                    )
4993
4994                # If analysis contain proband param
4995                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4996                    "proband", {}
4997                ):
4998                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4999
5000                # If no proband (usually uniq sample)
5001                else:
5002                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
5003
5004                # Log
5005                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
5006
5007                # Run command
5008                result = subprocess.call(
5009                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
5010                )
5011                if result:
5012                    log.error("Exomiser command failed")
5013                    raise ValueError("Exomiser command failed")
5014
5015                ### RESULTS ###
5016                ###############
5017
5018                ### Annotate with TSV fields ###
5019
5020                # Init result tsv file
5021                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
5022
5023                # Init result tsv file
5024                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
5025
5026                # Parse TSV file and explode columns in INFO field
5027                if exomiser_to_info and os.path.exists(output_results_tsv):
5028
5029                    # Log
5030                    log.debug("Exomiser columns to VCF INFO field")
5031
5032                    # Retrieve columns and types
5033                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
5034                    output_results_tsv_df = self.get_query_to_df(query)
5035                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
5036
5037                    # Init concat fields for update
5038                    sql_query_update_concat_fields = []
5039
5040                    # Fields to avoid
5041                    fields_to_avoid = [
5042                        "CONTIG",
5043                        "START",
5044                        "END",
5045                        "REF",
5046                        "ALT",
5047                        "QUAL",
5048                        "FILTER",
5049                        "GENOTYPE",
5050                    ]
5051
5052                    # List all columns to add into header
5053                    for header_column in output_results_tsv_columns:
5054
5055                        # If header column is enable
5056                        if header_column not in fields_to_avoid:
5057
5058                            # Header info type
5059                            header_info_type = "String"
5060                            header_column_df = output_results_tsv_df[header_column]
5061                            header_column_df_dtype = header_column_df.dtype
5062                            if header_column_df_dtype == object:
5063                                if (
5064                                    pd.to_numeric(header_column_df, errors="coerce")
5065                                    .notnull()
5066                                    .all()
5067                                ):
5068                                    header_info_type = "Float"
5069                            else:
5070                                header_info_type = "Integer"
5071
5072                            # Header info
5073                            characters_to_validate = ["-"]
5074                            pattern = "[" + "".join(characters_to_validate) + "]"
5075                            header_info_name = re.sub(
5076                                pattern,
5077                                "_",
5078                                f"Exomiser_{header_column}".replace("#", ""),
5079                            )
5080                            header_info_number = "."
5081                            header_info_description = (
5082                                f"Exomiser {header_column} annotation"
5083                            )
5084                            header_info_source = "Exomiser"
5085                            header_info_version = "unknown"
5086                            header_info_code = CODE_TYPE_MAP[header_info_type]
5087                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
5088                                header_info_name,
5089                                header_info_number,
5090                                header_info_type,
5091                                header_info_description,
5092                                header_info_source,
5093                                header_info_version,
5094                                header_info_code,
5095                            )
5096
5097                            # Add field to add for update to concat fields
5098                            sql_query_update_concat_fields.append(
5099                                f"""
5100                                CASE
5101                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
5102                                    THEN concat(
5103                                        '{header_info_name}=',
5104                                        table_parquet."{header_column}",
5105                                        ';'
5106                                        )
5107
5108                                    ELSE ''
5109                                END
5110                            """
5111                            )
5112
5113                    # Update query
5114                    sql_query_update = f"""
5115                        UPDATE {table_variants} as table_variants
5116                            SET INFO = concat(
5117                                            CASE
5118                                                WHEN INFO NOT IN ('', '.')
5119                                                THEN INFO
5120                                                ELSE ''
5121                                            END,
5122                                            CASE
5123                                                WHEN table_variants.INFO NOT IN ('','.')
5124                                                THEN ';'
5125                                                ELSE ''
5126                                            END,
5127                                            (
5128                                            SELECT 
5129                                                concat(
5130                                                    {",".join(sql_query_update_concat_fields)}
5131                                                )
5132                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
5133                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
5134                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
5135                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5136                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5137                                            )
5138                                        )
5139                            ;
5140                        """
5141
5142                    # Update
5143                    self.conn.execute(sql_query_update)
5144
5145                ### Annotate with VCF INFO field ###
5146
5147                # Init result VCF file
5148                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
5149
5150                # If VCF exists
5151                if os.path.exists(output_results_vcf):
5152
5153                    # Log
5154                    log.debug("Exomiser result VCF update variants")
5155
5156                    # Find Exomiser INFO field annotation in header
5157                    with gzip.open(output_results_vcf, "rt") as f:
5158                        header_list = self.read_vcf_header(f)
5159                    exomiser_vcf_header = vcf.Reader(
5160                        io.StringIO("\n".join(header_list))
5161                    )
5162
5163                    # Add annotation INFO field to header
5164                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
5165
5166                    # Update variants with VCF
5167                    self.update_from_vcf(output_results_vcf)
5168
5169        return True

This function annotates with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default: None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If not exists, database release will be downloaded (take a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
5171    def annotation_snpeff(self, threads: int = None) -> None:
5172        """
5173        This function annotate with snpEff
5174
5175        :param threads: The number of threads to use
5176        :return: the value of the variable "return_value".
5177        """
5178
5179        # DEBUG
5180        log.debug("Start annotation with snpeff databases")
5181
5182        # Threads
5183        if not threads:
5184            threads = self.get_threads()
5185        log.debug("Threads: " + str(threads))
5186
5187        # DEBUG
5188        delete_tmp = True
5189        if self.get_config().get("verbosity", "warning") in ["debug"]:
5190            delete_tmp = False
5191            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5192
5193        # Config
5194        config = self.get_config()
5195        log.debug("Config: " + str(config))
5196
5197        # Config - Folders - Databases
5198        databases_folders = (
5199            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
5200        )
5201        log.debug("Databases annotations: " + str(databases_folders))
5202
5203        # Config - snpEff bin command
5204        snpeff_bin_command = get_bin_command(
5205            bin="snpEff.jar",
5206            tool="snpeff",
5207            bin_type="jar",
5208            config=config,
5209            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
5210        )
5211        if not snpeff_bin_command:
5212            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
5213            log.error(msg_err)
5214            raise ValueError(msg_err)
5215
5216        # Config - snpEff databases
5217        snpeff_databases = (
5218            config.get("folders", {})
5219            .get("databases", {})
5220            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
5221        )
5222        snpeff_databases = full_path(snpeff_databases)
5223        if snpeff_databases is not None and snpeff_databases != "":
5224            log.debug(f"Create snpEff databases folder")
5225            if not os.path.exists(snpeff_databases):
5226                os.makedirs(snpeff_databases)
5227
5228        # Param
5229        param = self.get_param()
5230        log.debug("Param: " + str(param))
5231
5232        # Param
5233        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
5234        log.debug("Options: " + str(options))
5235
5236        # Param - Assembly
5237        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5238
5239        # Param - Options
5240        snpeff_options = (
5241            param.get("annotation", {}).get("snpeff", {}).get("options", "")
5242        )
5243        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
5244        snpeff_csvstats = (
5245            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
5246        )
5247        if snpeff_stats:
5248            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
5249            snpeff_stats = full_path(snpeff_stats)
5250            snpeff_options += f" -stats {snpeff_stats}"
5251        if snpeff_csvstats:
5252            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
5253            snpeff_csvstats = full_path(snpeff_csvstats)
5254            snpeff_options += f" -csvStats {snpeff_csvstats}"
5255
5256        # Data
5257        table_variants = self.get_table_variants()
5258
5259        # Check if not empty
5260        log.debug("Check if not empty")
5261        sql_query_chromosomes = (
5262            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5263        )
5264        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
5265        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5266            log.info(f"VCF empty")
5267            return
5268
5269        # Export in VCF
5270        log.debug("Create initial file to annotate")
5271        tmp_vcf = NamedTemporaryFile(
5272            prefix=self.get_prefix(),
5273            dir=self.get_tmp_dir(),
5274            suffix=".vcf.gz",
5275            delete=True,
5276        )
5277        tmp_vcf_name = tmp_vcf.name
5278
5279        # VCF header
5280        vcf_reader = self.get_header()
5281        log.debug("Initial header: " + str(vcf_reader.infos))
5282
5283        # Existing annotations
5284        for vcf_annotation in self.get_header().infos:
5285
5286            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5287            log.debug(
5288                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5289            )
5290
5291        # Memory limit
5292        # if config.get("memory", None):
5293        #     memory_limit = config.get("memory", "8G")
5294        # else:
5295        #     memory_limit = "8G"
5296        memory_limit = self.get_memory("8G")
5297        log.debug(f"memory_limit: {memory_limit}")
5298
5299        # snpEff java options
5300        snpeff_java_options = (
5301            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
5302        )
5303        log.debug(f"Exomiser java options: {snpeff_java_options}")
5304
5305        force_update_annotation = True
5306
5307        if "ANN" not in self.get_header().infos or force_update_annotation:
5308
5309            # Check snpEff database
5310            log.debug(f"Check snpEff databases {[assembly]}")
5311            databases_download_snpeff(
5312                folder=snpeff_databases, assemblies=[assembly], config=config
5313            )
5314
5315            # Export VCF file
5316            self.export_variant_vcf(
5317                vcf_file=tmp_vcf_name,
5318                remove_info=True,
5319                add_samples=False,
5320                index=True,
5321            )
5322
5323            # Tmp file
5324            err_files = []
5325            tmp_annotate_vcf = NamedTemporaryFile(
5326                prefix=self.get_prefix(),
5327                dir=self.get_tmp_dir(),
5328                suffix=".vcf",
5329                delete=False,
5330            )
5331            tmp_annotate_vcf_name = tmp_annotate_vcf.name
5332            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5333            err_files.append(tmp_annotate_vcf_name_err)
5334
5335            # Command
5336            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
5337            log.debug(f"Annotation - snpEff command: {snpeff_command}")
5338            run_parallel_commands([snpeff_command], 1)
5339
5340            # Error messages
5341            log.info(f"Error/Warning messages:")
5342            error_message_command_all = []
5343            error_message_command_warning = []
5344            error_message_command_err = []
5345            for err_file in err_files:
5346                with open(err_file, "r") as f:
5347                    for line in f:
5348                        message = line.strip()
5349                        error_message_command_all.append(message)
5350                        if line.startswith("[W::"):
5351                            error_message_command_warning.append(message)
5352                        if line.startswith("[E::"):
5353                            error_message_command_err.append(f"{err_file}: " + message)
5354            # log info
5355            for message in list(
5356                set(error_message_command_err + error_message_command_warning)
5357            ):
5358                log.info(f"   {message}")
5359            # debug info
5360            for message in list(set(error_message_command_all)):
5361                log.debug(f"   {message}")
5362            # failed
5363            if len(error_message_command_err):
5364                log.error("Annotation failed: Error in commands")
5365                raise ValueError("Annotation failed: Error in commands")
5366
5367            # Find annotation in header
5368            with open(tmp_annotate_vcf_name, "rt") as f:
5369                header_list = self.read_vcf_header(f)
5370            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5371
5372            for ann in annovar_vcf_header.infos:
5373                if ann not in self.get_header().infos:
5374                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5375
5376            # Update variants
5377            log.info(f"Annotation - Updating...")
5378            self.update_from_vcf(tmp_annotate_vcf_name)
5379
5380        else:
5381            if "ANN" in self.get_header().infos:
5382                log.debug(f"Existing snpEff annotations in VCF")
5383            if force_update_annotation:
5384                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
5386    def annotation_annovar(self, threads: int = None) -> None:
5387        """
5388        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
5389        annotations
5390
5391        :param threads: number of threads to use
5392        :return: the value of the variable "return_value".
5393        """
5394
5395        # DEBUG
5396        log.debug("Start annotation with Annovar databases")
5397
5398        # Threads
5399        if not threads:
5400            threads = self.get_threads()
5401        log.debug("Threads: " + str(threads))
5402
5403        # Tmp en Err files
5404        tmp_files = []
5405        err_files = []
5406
5407        # DEBUG
5408        delete_tmp = True
5409        if self.get_config().get("verbosity", "warning") in ["debug"]:
5410            delete_tmp = False
5411            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5412
5413        # Config
5414        config = self.get_config()
5415        log.debug("Config: " + str(config))
5416
5417        # Config - Folders - Databases
5418        databases_folders = (
5419            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
5420        )
5421        log.debug("Databases annotations: " + str(databases_folders))
5422
5423        # Config - annovar bin command
5424        annovar_bin_command = get_bin_command(
5425            bin="table_annovar.pl",
5426            tool="annovar",
5427            bin_type="perl",
5428            config=config,
5429            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
5430        )
5431        if not annovar_bin_command:
5432            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
5433            log.error(msg_err)
5434            raise ValueError(msg_err)
5435
5436        # Config - BCFTools bin command
5437        bcftools_bin_command = get_bin_command(
5438            bin="bcftools",
5439            tool="bcftools",
5440            bin_type="bin",
5441            config=config,
5442            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5443        )
5444        if not bcftools_bin_command:
5445            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5446            log.error(msg_err)
5447            raise ValueError(msg_err)
5448
5449        # Config - annovar databases
5450        annovar_databases = (
5451            config.get("folders", {})
5452            .get("databases", {})
5453            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5454        )
5455        if annovar_databases is not None:
5456            if isinstance(annovar_databases, list):
5457                annovar_databases = full_path(annovar_databases[0])
5458                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
5459            annovar_databases = full_path(annovar_databases)
5460            if not os.path.exists(annovar_databases):
5461                log.info(f"Annovar databases folder '{annovar_databases}' created")
5462                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
5463        else:
5464            msg_err = f"Annovar databases configuration failed"
5465            log.error(msg_err)
5466            raise ValueError(msg_err)
5467
5468        # Param
5469        param = self.get_param()
5470        log.debug("Param: " + str(param))
5471
5472        # Param - options
5473        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5474        log.debug("Options: " + str(options))
5475
5476        # Param - annotations
5477        annotations = (
5478            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5479        )
5480        log.debug("Annotations: " + str(annotations))
5481
5482        # Param - Assembly
5483        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5484
5485        # Annovar database assembly
5486        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5487        if annovar_databases_assembly != "" and not os.path.exists(
5488            annovar_databases_assembly
5489        ):
5490            os.makedirs(annovar_databases_assembly)
5491
5492        # Data
5493        table_variants = self.get_table_variants()
5494
5495        # Check if not empty
5496        log.debug("Check if not empty")
5497        sql_query_chromosomes = (
5498            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5499        )
5500        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5501        if not sql_query_chromosomes_df["count"][0]:
5502            log.info(f"VCF empty")
5503            return
5504
5505        # VCF header
5506        vcf_reader = self.get_header()
5507        log.debug("Initial header: " + str(vcf_reader.infos))
5508
5509        # Existing annotations
5510        for vcf_annotation in self.get_header().infos:
5511
5512            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5513            log.debug(
5514                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5515            )
5516
5517        force_update_annotation = True
5518
5519        if annotations:
5520
5521            commands = []
5522            tmp_annotates_vcf_name_list = []
5523
5524            # Export in VCF
5525            log.debug("Create initial file to annotate")
5526            tmp_vcf = NamedTemporaryFile(
5527                prefix=self.get_prefix(),
5528                dir=self.get_tmp_dir(),
5529                suffix=".vcf.gz",
5530                delete=False,
5531            )
5532            tmp_vcf_name = tmp_vcf.name
5533            tmp_files.append(tmp_vcf_name)
5534            tmp_files.append(tmp_vcf_name + ".tbi")
5535
5536            # Export VCF file
5537            self.export_variant_vcf(
5538                vcf_file=tmp_vcf_name,
5539                remove_info=".",
5540                add_samples=False,
5541                index=True,
5542            )
5543
5544            # Create file for field rename
5545            log.debug("Create file for field rename")
5546            tmp_rename = NamedTemporaryFile(
5547                prefix=self.get_prefix(),
5548                dir=self.get_tmp_dir(),
5549                suffix=".rename",
5550                delete=False,
5551            )
5552            tmp_rename_name = tmp_rename.name
5553            tmp_files.append(tmp_rename_name)
5554
5555            # Check Annovar database
5556            log.debug(
5557                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5558            )
5559            databases_download_annovar(
5560                folder=annovar_databases,
5561                files=list(annotations.keys()),
5562                assemblies=[assembly],
5563            )
5564
5565            for annotation in annotations:
5566                annotation_fields = annotations[annotation]
5567
5568                if not annotation_fields:
5569                    annotation_fields = {"INFO": None}
5570
5571                log.info(f"Annotations Annovar - database '{annotation}'")
5572                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5573
5574                # Tmp file for annovar
5575                err_files = []
5576                tmp_annotate_vcf_directory = TemporaryDirectory(
5577                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5578                )
5579                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5580                tmp_annotate_vcf_name_annovar = (
5581                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5582                )
5583                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5584                err_files.append(tmp_annotate_vcf_name_err)
5585                tmp_files.append(tmp_annotate_vcf_name_err)
5586
5587                # Tmp file final vcf annotated by annovar
5588                tmp_annotate_vcf = NamedTemporaryFile(
5589                    prefix=self.get_prefix(),
5590                    dir=self.get_tmp_dir(),
5591                    suffix=".vcf.gz",
5592                    delete=False,
5593                )
5594                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5595                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5596                tmp_files.append(tmp_annotate_vcf_name)
5597                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5598
5599                # Number of fields
5600                annotation_list = []
5601                annotation_renamed_list = []
5602
5603                for annotation_field in annotation_fields:
5604
5605                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5606                    annotation_fields_new_name = annotation_fields.get(
5607                        annotation_field, annotation_field
5608                    )
5609                    if not annotation_fields_new_name:
5610                        annotation_fields_new_name = annotation_field
5611
5612                    if (
5613                        force_update_annotation
5614                        or annotation_fields_new_name not in self.get_header().infos
5615                    ):
5616                        annotation_list.append(annotation_field)
5617                        annotation_renamed_list.append(annotation_fields_new_name)
5618                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5619                        log.warning(
5620                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5621                        )
5622
5623                    # Add rename info
5624                    run_parallel_commands(
5625                        [
5626                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5627                        ],
5628                        1,
5629                    )
5630
5631                # log.debug("fields_to_removed: " + str(fields_to_removed))
5632                log.debug("annotation_list: " + str(annotation_list))
5633
5634                # protocol
5635                protocol = annotation
5636
5637                # argument
5638                argument = ""
5639
5640                # operation
5641                operation = "f"
5642                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5643                    "ensGene"
5644                ):
5645                    operation = "g"
5646                    if options.get("genebase", None):
5647                        argument = f"""'{options.get("genebase","")}'"""
5648                elif annotation in ["cytoBand"]:
5649                    operation = "r"
5650
5651                # argument option
5652                argument_option = ""
5653                if argument != "":
5654                    argument_option = " --argument " + argument
5655
5656                # command options
5657                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5658                for option in options:
5659                    if option not in ["genebase"]:
5660                        command_options += f""" --{option}={options[option]}"""
5661
5662                # Command
5663
5664                # Command - Annovar
5665                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5666                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5667
5668                # Command - start pipe
5669                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5670
5671                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5672                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5673
5674                # Command - Special characters (refGene annotation)
5675                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5676
5677                # Command - Clean empty fields (with value ".")
5678                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5679
5680                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5681                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5682                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5683                    # for ann in annotation_renamed_list:
5684                    for ann in annotation_list:
5685                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5686
5687                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5688
5689                # Command - indexing
5690                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5691
5692                log.debug(f"Annotation - Annovar command: {command_annovar}")
5693                run_parallel_commands([command_annovar], 1)
5694
5695                # Error messages
5696                log.info(f"Error/Warning messages:")
5697                error_message_command_all = []
5698                error_message_command_warning = []
5699                error_message_command_err = []
5700                for err_file in err_files:
5701                    with open(err_file, "r") as f:
5702                        for line in f:
5703                            message = line.strip()
5704                            error_message_command_all.append(message)
5705                            if line.startswith("[W::") or line.startswith("WARNING"):
5706                                error_message_command_warning.append(message)
5707                            if line.startswith("[E::") or line.startswith("ERROR"):
5708                                error_message_command_err.append(
5709                                    f"{err_file}: " + message
5710                                )
5711                # log info
5712                for message in list(
5713                    set(error_message_command_err + error_message_command_warning)
5714                ):
5715                    log.info(f"   {message}")
5716                # debug info
5717                for message in list(set(error_message_command_all)):
5718                    log.debug(f"   {message}")
5719                # failed
5720                if len(error_message_command_err):
5721                    log.error("Annotation failed: Error in commands")
5722                    raise ValueError("Annotation failed: Error in commands")
5723
5724            if tmp_annotates_vcf_name_list:
5725
5726                # List of annotated files
5727                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5728
5729                # Tmp file
5730                tmp_annotate_vcf = NamedTemporaryFile(
5731                    prefix=self.get_prefix(),
5732                    dir=self.get_tmp_dir(),
5733                    suffix=".vcf.gz",
5734                    delete=False,
5735                )
5736                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5737                tmp_files.append(tmp_annotate_vcf_name)
5738                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5739                err_files.append(tmp_annotate_vcf_name_err)
5740                tmp_files.append(tmp_annotate_vcf_name_err)
5741
5742                # Command merge
5743                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5744                log.info(
5745                    f"Annotation Annovar - Annotation merging "
5746                    + str(len(tmp_annotates_vcf_name_list))
5747                    + " annotated files"
5748                )
5749                log.debug(f"Annotation - merge command: {merge_command}")
5750                run_parallel_commands([merge_command], 1)
5751
5752                # Find annotation in header
5753                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5754                    header_list = self.read_vcf_header(f)
5755                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5756
5757                for ann in annovar_vcf_header.infos:
5758                    if ann not in self.get_header().infos:
5759                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5760
5761                # Update variants
5762                log.info(f"Annotation Annovar - Updating...")
5763                self.update_from_vcf(tmp_annotate_vcf_name)
5764
5765            # Clean files
5766            # Tmp file remove command
5767            if True:
5768                tmp_files_remove_command = ""
5769                if tmp_files:
5770                    tmp_files_remove_command = " ".join(tmp_files)
5771                clean_command = f" rm -f {tmp_files_remove_command} "
5772                log.debug(f"Annotation Annovar - Annotation cleaning ")
5773                log.debug(f"Annotation - cleaning command: {clean_command}")
5774                run_parallel_commands([clean_command], 1)

It takes a VCF file, annotates it with Annovar, and then updates the database
with the new annotations.

Parameters:
    threads: number of threads to use

Returns:
    the value of the variable "return_value".
5777    def annotation_parquet(self, threads: int = None) -> None:
5778        """
5779        It takes a VCF file, and annotates it with a parquet file
5780
5781        :param threads: number of threads to use for the annotation
5782        :return: the value of the variable "result".
5783        """
5784
5785        # DEBUG
5786        log.debug("Start annotation with parquet databases")
5787
5788        # Threads
5789        if not threads:
5790            threads = self.get_threads()
5791        log.debug("Threads: " + str(threads))
5792
5793        # DEBUG
5794        delete_tmp = True
5795        if self.get_config().get("verbosity", "warning") in ["debug"]:
5796            delete_tmp = False
5797            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5798
5799        # Config
5800        databases_folders = set(
5801            self.get_config()
5802            .get("folders", {})
5803            .get("databases", {})
5804            .get("annotations", ["."])
5805            + self.get_config()
5806            .get("folders", {})
5807            .get("databases", {})
5808            .get("parquet", ["."])
5809        )
5810        log.debug("Databases annotations: " + str(databases_folders))
5811
5812        # Param
5813        annotations = (
5814            self.get_param()
5815            .get("annotation", {})
5816            .get("parquet", {})
5817            .get("annotations", None)
5818        )
5819        log.debug("Annotations: " + str(annotations))
5820
5821        # Assembly
5822        assembly = self.get_param().get(
5823            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5824        )
5825
5826        # Force Update Annotation
5827        force_update_annotation = (
5828            self.get_param()
5829            .get("annotation", {})
5830            .get("options", {})
5831            .get("annotations_update", False)
5832        )
5833        log.debug(f"force_update_annotation={force_update_annotation}")
5834        force_append_annotation = (
5835            self.get_param()
5836            .get("annotation", {})
5837            .get("options", {})
5838            .get("annotations_append", False)
5839        )
5840        log.debug(f"force_append_annotation={force_append_annotation}")
5841
5842        # Data
5843        table_variants = self.get_table_variants()
5844
5845        # Check if not empty
5846        log.debug("Check if not empty")
5847        sql_query_chromosomes_df = self.get_query_to_df(
5848            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5849        )
5850        if not sql_query_chromosomes_df["count"][0]:
5851            log.info(f"VCF empty")
5852            return
5853
5854        # VCF header
5855        vcf_reader = self.get_header()
5856        log.debug("Initial header: " + str(vcf_reader.infos))
5857
5858        # Nb Variants POS
5859        log.debug("NB Variants Start")
5860        nb_variants = self.conn.execute(
5861            f"SELECT count(*) AS count FROM variants"
5862        ).fetchdf()["count"][0]
5863        log.debug("NB Variants Stop")
5864
5865        # Existing annotations
5866        for vcf_annotation in self.get_header().infos:
5867
5868            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5869            log.debug(
5870                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5871            )
5872
5873        # Added columns
5874        added_columns = []
5875
5876        # drop indexes
5877        log.debug(f"Drop indexes...")
5878        self.drop_indexes()
5879
5880        if annotations:
5881
5882            if "ALL" in annotations:
5883
5884                all_param = annotations.get("ALL", {})
5885                all_param_formats = all_param.get("formats", None)
5886                all_param_releases = all_param.get("releases", None)
5887
5888                databases_infos_dict = self.scan_databases(
5889                    database_formats=all_param_formats,
5890                    database_releases=all_param_releases,
5891                )
5892                for database_infos in databases_infos_dict.keys():
5893                    if database_infos not in annotations:
5894                        annotations[database_infos] = {"INFO": None}
5895
5896            for annotation in annotations:
5897
5898                if annotation in ["ALL"]:
5899                    continue
5900
5901                # Annotation Name
5902                annotation_name = os.path.basename(annotation)
5903
5904                # Annotation fields
5905                annotation_fields = annotations[annotation]
5906                if not annotation_fields:
5907                    annotation_fields = {"INFO": None}
5908
5909                log.debug(f"Annotation '{annotation_name}'")
5910                log.debug(
5911                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5912                )
5913
5914                # Create Database
5915                database = Database(
5916                    database=annotation,
5917                    databases_folders=databases_folders,
5918                    assembly=assembly,
5919                )
5920
5921                # Find files
5922                parquet_file = database.get_database()
5923                parquet_hdr_file = database.get_header_file()
5924                parquet_type = database.get_type()
5925
5926                # Check if files exists
5927                if not parquet_file or not parquet_hdr_file:
5928                    msg_err_list = []
5929                    if not parquet_file:
5930                        msg_err_list.append(
5931                            f"Annotation failed: Annotation file not found"
5932                        )
5933                    if parquet_file and not parquet_hdr_file:
5934                        msg_err_list.append(
5935                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
5936                        )
5937
5938                    log.error(". ".join(msg_err_list))
5939                    raise ValueError(". ".join(msg_err_list))
5940                else:
5941                    # Get parquet connexion
5942                    parquet_sql_attach = database.get_sql_database_attach(
5943                        output="query"
5944                    )
5945                    if parquet_sql_attach:
5946                        self.conn.execute(parquet_sql_attach)
5947                    parquet_file_link = database.get_sql_database_link()
5948                    # Log
5949                    log.debug(
5950                        f"Annotation '{annotation_name}' - file: "
5951                        + str(parquet_file)
5952                        + " and "
5953                        + str(parquet_hdr_file)
5954                    )
5955
5956                    # Database full header columns
5957                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5958                        parquet_hdr_file
5959                    )
5960                    # Log
5961                    log.debug(
5962                        "Annotation database header columns : "
5963                        + str(parquet_hdr_vcf_header_columns)
5964                    )
5965
5966                    # Load header as VCF object
5967                    parquet_hdr_vcf_header_infos = database.get_header().infos
5968                    # Log
5969                    log.debug(
5970                        "Annotation database header: "
5971                        + str(parquet_hdr_vcf_header_infos)
5972                    )
5973
5974                    # Get extra infos
5975                    parquet_columns = database.get_extra_columns()
5976                    # Log
5977                    log.debug("Annotation database Columns: " + str(parquet_columns))
5978
5979                    # Add extra columns if "ALL" in annotation_fields
5980                    # if "ALL" in annotation_fields:
5981                    #     allow_add_extra_column = True
5982                    if "ALL" in annotation_fields and database.get_extra_columns():
5983                        for extra_column in database.get_extra_columns():
5984                            if (
5985                                extra_column not in annotation_fields
5986                                and extra_column.replace("INFO/", "")
5987                                not in parquet_hdr_vcf_header_infos
5988                            ):
5989                                parquet_hdr_vcf_header_infos[extra_column] = (
5990                                    vcf.parser._Info(
5991                                        extra_column,
5992                                        ".",
5993                                        "String",
5994                                        f"{extra_column} description",
5995                                        "unknown",
5996                                        "unknown",
5997                                        self.code_type_map["String"],
5998                                    )
5999                                )
6000
6001                    # For all fields in database
6002                    annotation_fields_all = False
6003                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
6004                        annotation_fields_all = True
6005                        annotation_fields = {
6006                            key: key for key in parquet_hdr_vcf_header_infos
6007                        }
6008
6009                        log.debug(
6010                            "Annotation database header - All annotations added: "
6011                            + str(annotation_fields)
6012                        )
6013
6014                    # Init
6015
6016                    # List of annotation fields to use
6017                    sql_query_annotation_update_info_sets = []
6018
6019                    # List of annotation to agregate
6020                    sql_query_annotation_to_agregate = []
6021
6022                    # Number of fields
6023                    nb_annotation_field = 0
6024
6025                    # Annotation fields processed
6026                    annotation_fields_processed = []
6027
6028                    # Columns mapping
6029                    map_columns = database.map_columns(
6030                        columns=annotation_fields, prefixes=["INFO/"]
6031                    )
6032
6033                    # Query dict for fields to remove (update option)
6034                    query_dict_remove = {}
6035
6036                    # Fetch Anotation fields
6037                    for annotation_field in annotation_fields:
6038
6039                        # annotation_field_column
6040                        annotation_field_column = map_columns.get(
6041                            annotation_field, "INFO"
6042                        )
6043
6044                        # field new name, if parametered
6045                        annotation_fields_new_name = annotation_fields.get(
6046                            annotation_field, annotation_field
6047                        )
6048                        if not annotation_fields_new_name:
6049                            annotation_fields_new_name = annotation_field
6050
6051                        # To annotate
6052                        # force_update_annotation = True
6053                        # force_append_annotation = True
6054                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
6055                        if annotation_field in parquet_hdr_vcf_header_infos and (
6056                            force_update_annotation
6057                            or force_append_annotation
6058                            or (
6059                                annotation_fields_new_name
6060                                not in self.get_header().infos
6061                            )
6062                        ):
6063
6064                            # Add field to annotation to process list
6065                            annotation_fields_processed.append(
6066                                annotation_fields_new_name
6067                            )
6068
6069                            # explode infos for the field
6070                            annotation_fields_new_name_info_msg = ""
6071                            if (
6072                                force_update_annotation
6073                                and annotation_fields_new_name
6074                                in self.get_header().infos
6075                            ):
6076                                # Remove field from INFO
6077                                query = f"""
6078                                    UPDATE {table_variants} as table_variants
6079                                    SET INFO = REGEXP_REPLACE(
6080                                                concat(table_variants.INFO,''),
6081                                                ';*{annotation_fields_new_name}=[^;]*',
6082                                                ''
6083                                                )
6084                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
6085                                """
6086                                annotation_fields_new_name_info_msg = " [update]"
6087                                query_dict_remove[
6088                                    f"remove 'INFO/{annotation_fields_new_name}'"
6089                                ] = query
6090
6091                            # Sep between fields in INFO
6092                            nb_annotation_field += 1
6093                            if nb_annotation_field > 1:
6094                                annotation_field_sep = ";"
6095                            else:
6096                                annotation_field_sep = ""
6097
6098                            log.info(
6099                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
6100                            )
6101
6102                            # Add INFO field to header
6103                            parquet_hdr_vcf_header_infos_number = (
6104                                parquet_hdr_vcf_header_infos[annotation_field].num
6105                                or "."
6106                            )
6107                            parquet_hdr_vcf_header_infos_type = (
6108                                parquet_hdr_vcf_header_infos[annotation_field].type
6109                                or "String"
6110                            )
6111                            parquet_hdr_vcf_header_infos_description = (
6112                                parquet_hdr_vcf_header_infos[annotation_field].desc
6113                                or f"{annotation_field} description"
6114                            )
6115                            parquet_hdr_vcf_header_infos_source = (
6116                                parquet_hdr_vcf_header_infos[annotation_field].source
6117                                or "unknown"
6118                            )
6119                            parquet_hdr_vcf_header_infos_version = (
6120                                parquet_hdr_vcf_header_infos[annotation_field].version
6121                                or "unknown"
6122                            )
6123
6124                            vcf_reader.infos[annotation_fields_new_name] = (
6125                                vcf.parser._Info(
6126                                    annotation_fields_new_name,
6127                                    parquet_hdr_vcf_header_infos_number,
6128                                    parquet_hdr_vcf_header_infos_type,
6129                                    parquet_hdr_vcf_header_infos_description,
6130                                    parquet_hdr_vcf_header_infos_source,
6131                                    parquet_hdr_vcf_header_infos_version,
6132                                    self.code_type_map[
6133                                        parquet_hdr_vcf_header_infos_type
6134                                    ],
6135                                )
6136                            )
6137
6138                            # Append
6139                            if force_append_annotation:
6140                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
6141                            else:
6142                                query_case_when_append = ""
6143
6144                            # Annotation/Update query fields
6145                            # Found in INFO column
6146                            if (
6147                                annotation_field_column == "INFO"
6148                                and "INFO" in parquet_hdr_vcf_header_columns
6149                            ):
6150                                sql_query_annotation_update_info_sets.append(
6151                                    f"""
6152                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
6153                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
6154                                        ELSE ''
6155                                    END
6156                                """
6157                                )
6158                            # Found in a specific column
6159                            else:
6160                                sql_query_annotation_update_info_sets.append(
6161                                    f"""
6162                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
6163                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
6164                                        ELSE ''
6165                                    END
6166                                """
6167                                )
6168                                sql_query_annotation_to_agregate.append(
6169                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
6170                                )
6171
6172                        # Not to annotate
6173                        else:
6174
6175                            if force_update_annotation:
6176                                annotation_message = "forced"
6177                            else:
6178                                annotation_message = "skipped"
6179
6180                            if annotation_field not in parquet_hdr_vcf_header_infos:
6181                                log.warning(
6182                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
6183                                )
6184                            if annotation_fields_new_name in self.get_header().infos:
6185                                log.warning(
6186                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
6187                                )
6188
6189                    # Check if ALL fields have to be annotated. Thus concat all INFO field
6190                    # allow_annotation_full_info = True
6191                    allow_annotation_full_info = not force_append_annotation
6192
6193                    if parquet_type in ["regions"]:
6194                        allow_annotation_full_info = False
6195
6196                    if (
6197                        allow_annotation_full_info
6198                        and nb_annotation_field == len(annotation_fields)
6199                        and annotation_fields_all
6200                        and (
6201                            "INFO" in parquet_hdr_vcf_header_columns
6202                            and "INFO" in database.get_extra_columns()
6203                        )
6204                    ):
6205                        log.debug("Column INFO annotation enabled")
6206                        sql_query_annotation_update_info_sets = []
6207                        sql_query_annotation_update_info_sets.append(
6208                            f" table_parquet.INFO "
6209                        )
6210
6211                    if sql_query_annotation_update_info_sets:
6212
6213                        # Annotate
6214                        log.info(f"Annotation '{annotation_name}' - Annotation...")
6215
6216                        # Join query annotation update info sets for SQL
6217                        sql_query_annotation_update_info_sets_sql = ",".join(
6218                            sql_query_annotation_update_info_sets
6219                        )
6220
6221                        # Check chromosomes list (and variants infos)
6222                        sql_query_chromosomes = f"""
6223                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
6224                            FROM {table_variants} as table_variants
6225                            GROUP BY table_variants."#CHROM"
6226                            ORDER BY table_variants."#CHROM"
6227                            """
6228                        sql_query_chromosomes_df = self.conn.execute(
6229                            sql_query_chromosomes
6230                        ).df()
6231                        sql_query_chromosomes_dict = {
6232                            entry["CHROM"]: {
6233                                "count": entry["count_variants"],
6234                                "min": entry["min_variants"],
6235                                "max": entry["max_variants"],
6236                            }
6237                            for index, entry in sql_query_chromosomes_df.iterrows()
6238                        }
6239
6240                        # Init
6241                        nb_of_query = 0
6242                        nb_of_variant_annotated = 0
6243                        query_dict = query_dict_remove
6244
6245                        # for chrom in sql_query_chromosomes_df["CHROM"]:
6246                        for chrom in sql_query_chromosomes_dict:
6247
6248                            # Number of variant by chromosome
6249                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
6250                                chrom, {}
6251                            ).get("count", 0)
6252
6253                            log.debug(
6254                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
6255                            )
6256
6257                            # Annotation with regions database
6258                            if parquet_type in ["regions"]:
6259                                sql_query_annotation_from_clause = f"""
6260                                    FROM (
6261                                        SELECT 
6262                                            '{chrom}' AS \"#CHROM\",
6263                                            table_variants_from.\"POS\" AS \"POS\",
6264                                            {",".join(sql_query_annotation_to_agregate)}
6265                                        FROM {table_variants} as table_variants_from
6266                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
6267                                            table_parquet_from."#CHROM" = '{chrom}'
6268                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
6269                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
6270                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
6271                                                )
6272                                        )
6273                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
6274                                        GROUP BY table_variants_from.\"POS\"
6275                                        )
6276                                        as table_parquet
6277                                """
6278
6279                                sql_query_annotation_where_clause = """
6280                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
6281                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
6282                                """
6283
6284                            # Annotation with variants database
6285                            else:
6286                                sql_query_annotation_from_clause = f"""
6287                                    FROM {parquet_file_link} as table_parquet
6288                                """
6289                                sql_query_annotation_where_clause = f"""
6290                                    table_variants."#CHROM" = '{chrom}'
6291                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
6292                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
6293                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
6294                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
6295                                """
6296
6297                            # Create update query
6298                            sql_query_annotation_chrom_interval_pos = f"""
6299                                UPDATE {table_variants} as table_variants
6300                                    SET INFO = 
6301                                        concat(
6302                                            CASE WHEN table_variants.INFO NOT IN ('','.')
6303                                                THEN table_variants.INFO
6304                                                ELSE ''
6305                                            END
6306                                            ,
6307                                            CASE WHEN table_variants.INFO NOT IN ('','.')
6308                                                        AND (
6309                                                        concat({sql_query_annotation_update_info_sets_sql})
6310                                                        )
6311                                                        NOT IN ('','.') 
6312                                                    THEN ';'
6313                                                    ELSE ''
6314                                            END
6315                                            ,
6316                                            {sql_query_annotation_update_info_sets_sql}
6317                                            )
6318                                    {sql_query_annotation_from_clause}
6319                                    WHERE {sql_query_annotation_where_clause}
6320                                    ;
6321                                """
6322
6323                            # Add update query to dict
6324                            query_dict[
6325                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
6326                            ] = sql_query_annotation_chrom_interval_pos
6327
6328                        nb_of_query = len(query_dict)
6329                        num_query = 0
6330
6331                        # SET max_expression_depth TO x
6332                        self.conn.execute("SET max_expression_depth TO 10000")
6333
6334                        for query_name in query_dict:
6335                            query = query_dict[query_name]
6336                            num_query += 1
6337                            log.info(
6338                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
6339                            )
6340                            result = self.conn.execute(query)
6341                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
6342                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
6343                            log.info(
6344                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
6345                            )
6346
6347                        log.info(
6348                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
6349                        )
6350
6351                    else:
6352
6353                        log.info(
6354                            f"Annotation '{annotation_name}' - No Annotations available"
6355                        )
6356
6357                    log.debug("Final header: " + str(vcf_reader.infos))
6358
6359        # Remove added columns
6360        for added_column in added_columns:
6361            self.drop_column(column=added_column)

It takes a VCF file and annotates it with a Parquet file

Parameters
  • threads: number of threads to use for the annotation
Returns

the value of the variable "result".

def annotation_splice(self, threads: int = None) -> None:
6363    def annotation_splice(self, threads: int = None) -> None:
6364        """
6365        This function annotate with snpEff
6366
6367        :param threads: The number of threads to use
6368        :return: the value of the variable "return_value".
6369        """
6370
6371        # DEBUG
6372        log.debug("Start annotation with splice tools")
6373
6374        # Threads
6375        if not threads:
6376            threads = self.get_threads()
6377        log.debug("Threads: " + str(threads))
6378
6379        # DEBUG
6380        delete_tmp = True
6381        if self.get_config().get("verbosity", "warning") in ["debug"]:
6382            delete_tmp = False
6383            log.debug("Delete tmp files/folders: " + str(delete_tmp))
6384
6385        # Config
6386        config = self.get_config()
6387        log.debug("Config: " + str(config))
6388        splice_config = config.get("tools", {}).get("splice", {})
6389        if not splice_config:
6390            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
6391            msg_err = "No Splice tool config"
6392            raise ValueError(msg_err)
6393        log.debug(f"splice_config: {splice_config}")
6394
6395        # Config - Folders - Databases
6396        databases_folders = (
6397            config.get("folders", {}).get("databases", {}).get("splice", ["."])
6398        )
6399        log.debug("Databases annotations: " + str(databases_folders))
6400
6401        # Splice docker image
6402        splice_docker_image = splice_config.get("docker").get("image")
6403
6404        # Pull splice image if it's not already there
6405        if not check_docker_image_exists(splice_docker_image):
6406            log.warning(
6407                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
6408            )
6409            try:
6410                command(f"docker pull {splice_config.get('docker').get('image')}")
6411            except subprocess.CalledProcessError:
6412                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
6413                log.error(msg_err)
6414                raise ValueError(msg_err)
6415
6416        # Config - splice databases
6417        splice_databases = (
6418            config.get("folders", {})
6419            .get("databases", {})
6420            .get("splice", DEFAULT_SPLICE_FOLDER)
6421        )
6422        splice_databases = full_path(splice_databases)
6423
6424        # Param
6425        param = self.get_param()
6426        log.debug("Param: " + str(param))
6427
6428        # Param
6429        options = param.get("annotation", {}).get("splice", {}).get("options", {})
6430        log.debug("Options: " + str(options))
6431
6432        # Data
6433        table_variants = self.get_table_variants()
6434
6435        # Check if not empty
6436        log.debug("Check if not empty")
6437        sql_query_chromosomes = (
6438            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
6439        )
6440        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6441            log.info("VCF empty")
6442            return None
6443
6444        # Export in VCF
6445        log.debug("Create initial file to annotate")
6446
6447        # Create output folder / work folder
6448        if options.get("output_folder", ""):
6449            output_folder = options.get("output_folder", "")
6450            if not os.path.exists(output_folder):
6451                Path(output_folder).mkdir(parents=True, exist_ok=True)
6452        else:
6453            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6454            if not os.path.exists(output_folder):
6455                Path(output_folder).mkdir(parents=True, exist_ok=True)
6456
6457        if options.get("workdir", ""):
6458            workdir = options.get("workdir", "")
6459        else:
6460            workdir = "/work"
6461
6462        # Create tmp VCF file
6463        tmp_vcf = NamedTemporaryFile(
6464            prefix=self.get_prefix(),
6465            dir=output_folder,
6466            suffix=".vcf",
6467            delete=False,
6468        )
6469        tmp_vcf_name = tmp_vcf.name
6470
6471        # VCF header
6472        header = self.get_header()
6473
6474        # Existing annotations
6475        for vcf_annotation in self.get_header().infos:
6476
6477            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6478            log.debug(
6479                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6480            )
6481
6482        # Memory limit
6483        if config.get("memory", None):
6484            memory_limit = config.get("memory", "8G").upper()
6485            # upper()
6486        else:
6487            memory_limit = "8G"
6488        log.debug(f"memory_limit: {memory_limit}")
6489
6490        # Check number of variants to annotate
6491        where_clause_regex_spliceai = r"SpliceAI_\w+"
6492        where_clause_regex_spip = r"SPiP_\w+"
6493        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6494        df_list_of_variants_to_annotate = self.get_query_to_df(
6495            query=f""" SELECT * FROM variants {where_clause} """
6496        )
6497        if len(df_list_of_variants_to_annotate) == 0:
6498            log.warning(
6499                f"No variants to annotate with splice. Variants probably already annotated with splice"
6500            )
6501            return None
6502        else:
6503            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6504
6505        # Export VCF file
6506        self.export_variant_vcf(
6507            vcf_file=tmp_vcf_name,
6508            remove_info=True,
6509            add_samples=True,
6510            index=False,
6511            where_clause=where_clause,
6512        )
6513        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
6514        if any(value for value in splice_config.values() if value is None):
6515            log.warning("At least one splice config parameter is empty")
6516            # exit annotation_splice
6517            return None
6518
6519        # Params in splice nf
6520        def check_values(dico: dict):
6521            """
6522            Ensure parameters for NF splice pipeline
6523            """
6524            for key, val in dico.items():
6525                if key == "genome":
6526                    if any(
6527                        assemb in options.get("genome", {})
6528                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6529                    ):
6530                        yield f"--{key} hg19"
6531                    elif any(
6532                        assemb in options.get("genome", {})
6533                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6534                    ):
6535                        yield f"--{key} hg38"
6536                elif (
6537                    (isinstance(val, str) and val)
6538                    or isinstance(val, int)
6539                    or isinstance(val, bool)
6540                ):
6541                    yield f"--{key} {val}"
6542
6543        # Genome
6544        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6545        options["genome"] = genome
6546        # NF params
6547        nf_params = []
6548        # Add options
6549        if options:
6550            log.debug(options)
6551            nf_params = list(check_values(options))
6552            log.debug(f"Splice NF params: {' '.join(nf_params)}")
6553        else:
6554            log.debug("No NF params provided")
6555        # Add threads
6556        if "threads" not in options.keys():
6557            nf_params.append(f"--threads {threads}")
6558        # Genome path
6559        genome_path = find_genome(
6560            config.get("folders", {})
6561            .get("databases", {})
6562            .get("genomes", DEFAULT_GENOME_FOLDER),
6563            file=f"{genome}.fa",
6564        )
6565        # Add genome path
6566        if not genome_path:
6567            raise ValueError(
6568                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6569            )
6570        else:
6571            log.debug(f"Genome: {genome_path}")
6572            nf_params.append(f"--genome_path {genome_path}")
6573
6574        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6575            """
6576            Setting up updated databases for SPiP and SpliceAI
6577            """
6578
6579            try:
6580
6581                # SpliceAI assembly transcriptome
6582                spliceai_assembly = os.path.join(
6583                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
6584                    options.get("genome"),
6585                    "transcriptome",
6586                )
6587                spip_assembly = options.get("genome")
6588
6589                spip = find(
6590                    f"transcriptome_{spip_assembly}.RData",
6591                    config.get("folders", {}).get("databases", {}).get("spip", {}),
6592                )
6593                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6594                log.debug(f"SPiP annotations: {spip}")
6595                log.debug(f"SpliceAI annotations: {spliceai}")
6596                if spip and spliceai:
6597                    return [
6598                        f"--spip_transcriptome {spip}",
6599                        f"--spliceai_transcriptome {spliceai}",
6600                    ]
6601                else:
6602                    log.warning(
6603                        "Can't find splice databases in configuration, use annotations file from image"
6604                    )
6605            except TypeError:
6606                log.warning(
6607                    "Can't find splice databases in configuration, use annotations file from image"
6608                )
6609                return []
6610
6611        # Add options, check if transcriptome option have already beend provided
6612        if (
6613            "spip_transcriptome" not in nf_params
6614            and "spliceai_transcriptome" not in nf_params
6615        ):
6616            splice_reference = splice_annotations(options, config)
6617            if splice_reference:
6618                nf_params.extend(splice_reference)
6619        # nf_params.append(f"--output_folder {output_folder}")
6620        random_uuid = f"HOWARD-SPLICE-{get_random()}"
6621        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6622        log.debug(cmd)
6623        splice_config["docker"]["command"] = cmd
6624
6625        # Ensure proxy is set
6626        proxy = [
6627            f"-e {var}={os.getenv(var)}"
6628            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
6629            if os.getenv(var) is not None
6630        ]
6631        docker_cmd = get_bin_command(
6632            tool="splice",
6633            bin_type="docker",
6634            config=config,
6635            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6636            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
6637        )
6638        # print(docker_cmd)
6639        # exit()
6640        # Docker debug
6641        # if splice_config.get("rm_container"):
6642        #     rm_container = "--rm"
6643        # else:
6644        #     rm_container = ""
6645        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6646        log.debug(docker_cmd)
6647        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6648        log.debug(res.stdout)
6649        if res.stderr:
6650            log.error(res.stderr)
6651        res.check_returncode()
6652        # Update variants
6653        log.info("Annotation - Updating...")
6654        # Test find output vcf
6655        log.debug(
6656            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6657        )
6658        output_vcf = []
6659        # Wrong folder to look in
6660        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6661            if (
6662                files
6663                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6664            ):
6665                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6666        # log.debug(os.listdir(options.get("output_folder")))
6667        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6668        if not output_vcf:
6669            log.debug(
6670                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6671            )
6672        else:
6673            # Get new header from annotated vcf
6674            log.debug(f"Initial header: {len(header.infos)} fields")
6675            # Create new header with splice infos
6676            new_vcf = Variants(input=output_vcf[0])
6677            new_vcf_header = new_vcf.get_header().infos
6678            for keys, infos in new_vcf_header.items():
6679                if keys not in header.infos.keys():
6680                    header.infos[keys] = infos
6681            log.debug(f"New header: {len(header.infos)} fields")
6682            log.debug(f"Splice tmp output: {output_vcf[0]}")
6683            self.update_from_vcf(output_vcf[0])
6684
6685        # Remove file
6686        remove_if_exists(output_vcf)

This function annotates variants with splice tools (SPiP and SpliceAI)

Parameters
  • threads: The number of threads to use
Returns

None; the method returns early when there is nothing to annotate.

def get_config_default(self, name: str) -> dict:
6692    def get_config_default(self, name: str) -> dict:
6693        """
6694        The function `get_config_default` returns a dictionary containing default configurations for
6695        various calculations and prioritizations.
6696
6697        :param name: The `get_config_default` function returns a dictionary containing default
6698        configurations for different calculations and prioritizations. The `name` parameter is used to
6699        specify which specific configuration to retrieve from the dictionary
6700        :type name: str
6701        :return: The function `get_config_default` returns a dictionary containing default configuration
6702        settings for different calculations and prioritizations. The specific configuration settings are
6703        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6704        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6705        returned. If there is no match, an empty dictionary is returned.
6706        """
6707
6708        config_default = {
6709            "calculations": {
6710                "variant_chr_pos_alt_ref": {
6711                    "type": "sql",
6712                    "name": "variant_chr_pos_alt_ref",
6713                    "description": "Create a variant ID with chromosome, position, alt and ref",
6714                    "available": False,
6715                    "output_column_name": "variant_chr_pos_alt_ref",
6716                    "output_column_type": "String",
6717                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6718                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6719                    "operation_info": True,
6720                },
6721                "VARTYPE": {
6722                    "type": "sql",
6723                    "name": "VARTYPE",
6724                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6725                    "available": True,
6726                    "output_column_name": "VARTYPE",
6727                    "output_column_type": "String",
6728                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6729                    "operation_query": """
6730                            CASE
6731                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6732                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6733                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6734                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6735                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6736                                ELSE 'UNDEFINED'
6737                            END
6738                            """,
6739                    "info_fields": ["SVTYPE"],
6740                    "operation_info": True,
6741                },
6742                "snpeff_hgvs": {
6743                    "type": "python",
6744                    "name": "snpeff_hgvs",
6745                    "description": "HGVS nomenclatures from snpEff annotation",
6746                    "available": True,
6747                    "function_name": "calculation_extract_snpeff_hgvs",
6748                    "function_params": ["snpeff_hgvs", "ANN"],
6749                },
6750                "snpeff_ann_explode": {
6751                    "type": "python",
6752                    "name": "snpeff_ann_explode",
6753                    "description": "Explode snpEff annotations with uniquify values",
6754                    "available": True,
6755                    "function_name": "calculation_snpeff_ann_explode",
6756                    "function_params": [False, "fields", "snpeff_", "ANN"],
6757                },
6758                "snpeff_ann_explode_uniquify": {
6759                    "type": "python",
6760                    "name": "snpeff_ann_explode_uniquify",
6761                    "description": "Explode snpEff annotations",
6762                    "available": True,
6763                    "function_name": "calculation_snpeff_ann_explode",
6764                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6765                },
6766                "snpeff_ann_explode_json": {
6767                    "type": "python",
6768                    "name": "snpeff_ann_explode_json",
6769                    "description": "Explode snpEff annotations in JSON format",
6770                    "available": True,
6771                    "function_name": "calculation_snpeff_ann_explode",
6772                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6773                },
6774                "NOMEN": {
6775                    "type": "python",
6776                    "name": "NOMEN",
6777                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
6778                    "available": True,
6779                    "function_name": "calculation_extract_nomen",
6780                    "function_params": [],
6781                },
6782                "FINDBYPIPELINE": {
6783                    "type": "python",
6784                    "name": "FINDBYPIPELINE",
6785                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6786                    "available": True,
6787                    "function_name": "calculation_find_by_pipeline",
6788                    "function_params": ["findbypipeline"],
6789                },
6790                "FINDBYSAMPLE": {
6791                    "type": "python",
6792                    "name": "FINDBYSAMPLE",
6793                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6794                    "available": True,
6795                    "function_name": "calculation_find_by_pipeline",
6796                    "function_params": ["findbysample"],
6797                },
6798                "GENOTYPECONCORDANCE": {
6799                    "type": "python",
6800                    "name": "GENOTYPECONCORDANCE",
6801                    "description": "Concordance of genotype for multi caller VCF",
6802                    "available": True,
6803                    "function_name": "calculation_genotype_concordance",
6804                    "function_params": [],
6805                },
6806                "BARCODE": {
6807                    "type": "python",
6808                    "name": "BARCODE",
6809                    "description": "BARCODE as VaRank tool",
6810                    "available": True,
6811                    "function_name": "calculation_barcode",
6812                    "function_params": [],
6813                },
6814                "BARCODEFAMILY": {
6815                    "type": "python",
6816                    "name": "BARCODEFAMILY",
6817                    "description": "BARCODEFAMILY as VaRank tool",
6818                    "available": True,
6819                    "function_name": "calculation_barcode_family",
6820                    "function_params": ["BCF"],
6821                },
6822                "TRIO": {
6823                    "type": "python",
6824                    "name": "TRIO",
6825                    "description": "Inheritance for a trio family",
6826                    "available": True,
6827                    "function_name": "calculation_trio",
6828                    "function_params": [],
6829                },
6830                "VAF": {
6831                    "type": "python",
6832                    "name": "VAF",
6833                    "description": "Variant Allele Frequency (VAF) harmonization",
6834                    "available": True,
6835                    "function_name": "calculation_vaf_normalization",
6836                    "function_params": [],
6837                },
6838                "VAF_stats": {
6839                    "type": "python",
6840                    "name": "VAF_stats",
6841                    "description": "Variant Allele Frequency (VAF) statistics",
6842                    "available": True,
6843                    "function_name": "calculation_genotype_stats",
6844                    "function_params": ["VAF"],
6845                },
6846                "DP_stats": {
6847                    "type": "python",
6848                    "name": "DP_stats",
6849                    "description": "Depth (DP) statistics",
6850                    "available": True,
6851                    "function_name": "calculation_genotype_stats",
6852                    "function_params": ["DP"],
6853                },
6854                "variant_id": {
6855                    "type": "python",
6856                    "name": "variant_id",
6857                    "description": "Variant ID generated from variant position and type",
6858                    "available": True,
6859                    "function_name": "calculation_variant_id",
6860                    "function_params": [],
6861                },
6862                "transcripts_json": {
6863                    "type": "python",
6864                    "name": "transcripts_json",
6865                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
6866                    "available": True,
6867                    "function_name": "calculation_transcripts_annotation",
6868                    "function_params": ["transcripts_json", None],
6869                },
6870                "transcripts_ann": {
6871                    "type": "python",
6872                    "name": "transcripts_ann",
6873                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
6874                    "available": True,
6875                    "function_name": "calculation_transcripts_annotation",
6876                    "function_params": [None, "transcripts_ann"],
6877                },
6878                "transcripts_annotations": {
6879                    "type": "python",
6880                    "name": "transcripts_annotations",
6881                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
6882                    "available": True,
6883                    "function_name": "calculation_transcripts_annotation",
6884                    "function_params": [None, None],
6885                },
6886                "transcripts_prioritization": {
6887                    "type": "python",
6888                    "name": "transcripts_prioritization",
6889                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6890                    "available": True,
6891                    "function_name": "calculation_transcripts_prioritization",
6892                    "function_params": [],
6893                },
6894                "transcripts_export": {
6895                    "type": "python",
6896                    "name": "transcripts_export",
6897                    "description": "Export transcripts table/view as a file (using param.json)",
6898                    "available": True,
6899                    "function_name": "calculation_transcripts_export",
6900                    "function_params": [],
6901                },
6902            },
6903            "prioritizations": {
6904                "default": {
6905                    "ANN2": [
6906                        {
6907                            "type": "contains",
6908                            "value": "HIGH",
6909                            "score": 5,
6910                            "flag": "PASS",
6911                            "comment": [
6912                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6913                            ],
6914                        },
6915                        {
6916                            "type": "contains",
6917                            "value": "MODERATE",
6918                            "score": 3,
6919                            "flag": "PASS",
6920                            "comment": [
6921                                "A non-disruptive variant that might change protein effectiveness"
6922                            ],
6923                        },
6924                        {
6925                            "type": "contains",
6926                            "value": "LOW",
6927                            "score": 0,
6928                            "flag": "FILTERED",
6929                            "comment": [
6930                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6931                            ],
6932                        },
6933                        {
6934                            "type": "contains",
6935                            "value": "MODIFIER",
6936                            "score": 0,
6937                            "flag": "FILTERED",
6938                            "comment": [
6939                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6940                            ],
6941                        },
6942                    ],
6943                }
6944            },
6945        }
6946
6947        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The key identifying which configuration section to retrieve from the default configuration dictionary (e.g. "calculations" or "prioritizations")
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
6949    def get_config_json(
6950        self, name: str, config_dict: dict = {}, config_file: str = None
6951    ) -> dict:
6952        """
6953        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6954        default values, a dictionary, and a file.
6955
6956        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6957        the name of the configuration. It is used to identify and retrieve the configuration settings
6958        for a specific component or module
6959        :type name: str
6960        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6961        dictionary that allows you to provide additional configuration settings or overrides. When you
6962        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6963        the key is the configuration setting you want to override or
6964        :type config_dict: dict
6965        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6966        specify the path to a configuration file that contains additional settings. If provided, the
6967        function will read the contents of this file and update the configuration dictionary with the
6968        values found in the file, overriding any existing values with the
6969        :type config_file: str
6970        :return: The function `get_config_json` returns a dictionary containing the configuration
6971        settings.
6972        """
6973
6974        # Create with default prioritizations
6975        config_default = self.get_config_default(name=name)
6976        configuration = config_default
6977        # log.debug(f"configuration={configuration}")
6978
6979        # Replace prioritizations from dict
6980        for config in config_dict:
6981            configuration[config] = config_dict[config]
6982
6983        # Replace prioritizations from file
6984        config_file = full_path(config_file)
6985        if config_file:
6986            if os.path.exists(config_file):
6987                with open(config_file) as config_file_content:
6988                    config_file_dict = json.load(config_file_content)
6989                for config in config_file_dict:
6990                    configuration[config] = config_file_dict[config]
6991            else:
6992                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6993                log.error(msg_error)
6994                raise ValueError(msg_error)
6995
6996        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or add
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the values from the file
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization( self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
6998    def prioritization(
6999        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
7000    ) -> bool:
7001        """
7002        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
7003        prioritizes variants based on configured profiles and criteria.
7004
7005        :param table: The `table` parameter in the `prioritization` function is used to specify the name
7006        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
7007        a table name is provided, the method will prioritize the variants in that specific table
7008        :type table: str
7009        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
7010        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
7011        provided, the code will use a default prefix value of "PZ"
7012        :type pz_prefix: str
7013        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
7014        additional parameters specific to the prioritization process. These parameters can include
7015        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
7016        configurations needed for the prioritization of variants in a V
7017        :type pz_param: dict
7018        :return: A boolean value (True) is being returned from the `prioritization` function.
7019        """
7020
7021        # Config
7022        config = self.get_config()
7023
7024        # Param
7025        param = self.get_param()
7026
7027        # Prioritization param
7028        if pz_param is not None:
7029            prioritization_param = pz_param
7030        else:
7031            prioritization_param = param.get("prioritization", {})
7032
7033        # Configuration profiles
7034        prioritization_config_file = prioritization_param.get(
7035            "prioritization_config", None
7036        )
7037        prioritization_config_file = full_path(prioritization_config_file)
7038        prioritizations_config = self.get_config_json(
7039            name="prioritizations", config_file=prioritization_config_file
7040        )
7041
7042        # Prioritization prefix
7043        pz_prefix_default = "PZ"
7044        if pz_prefix is None:
7045            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
7046
7047        # Prioritization options
7048        profiles = prioritization_param.get("profiles", [])
7049        if isinstance(profiles, str):
7050            profiles = profiles.split(",")
7051        pzfields = prioritization_param.get(
7052            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
7053        )
7054        if isinstance(pzfields, str):
7055            pzfields = pzfields.split(",")
7056        default_profile = prioritization_param.get("default_profile", None)
7057        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
7058        prioritization_score_mode = prioritization_param.get(
7059            "prioritization_score_mode", "HOWARD"
7060        )
7061
7062        # Quick Prioritizations
7063        prioritizations = param.get("prioritizations", None)
7064        if prioritizations:
7065            log.info("Quick Prioritization:")
7066            for profile in prioritizations.split(","):
7067                if profile not in profiles:
7068                    profiles.append(profile)
7069                    log.info(f"   {profile}")
7070
7071        # If profile "ALL" provided, all profiles in the config profiles
7072        if "ALL" in profiles:
7073            profiles = list(prioritizations_config.keys())
7074
7075        for profile in profiles:
7076            if prioritizations_config.get(profile, None):
7077                log.debug(f"Profile '{profile}' configured")
7078            else:
7079                msg_error = f"Profile '{profile}' NOT configured"
7080                log.error(msg_error)
7081                raise ValueError(msg_error)
7082
7083        if profiles:
7084            log.info(f"Prioritization... ")
7085        else:
7086            log.debug(f"No profile defined")
7087            return False
7088
7089        if not default_profile and len(profiles):
7090            default_profile = profiles[0]
7091
7092        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
7093        log.debug("Profiles to check: " + str(list(profiles)))
7094
7095        # Variables
7096        if table is not None:
7097            table_variants = table
7098        else:
7099            table_variants = self.get_table_variants(clause="update")
7100        log.debug(f"Table to prioritize: {table_variants}")
7101
7102        # Added columns
7103        added_columns = []
7104
7105        # Create list of PZfields
7106        # List of PZFields
7107        list_of_pzfields_original = pzfields + [
7108            pzfield + pzfields_sep + profile
7109            for pzfield in pzfields
7110            for profile in profiles
7111        ]
7112        list_of_pzfields = []
7113        log.debug(f"{list_of_pzfields_original}")
7114
7115        # Remove existing PZfields to use if exists
7116        for pzfield in list_of_pzfields_original:
7117            if self.get_header().infos.get(pzfield, None) is None:
7118                list_of_pzfields.append(pzfield)
7119                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
7120            else:
7121                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
7122
7123        if list_of_pzfields:
7124
7125            # Explode Infos prefix
7126            explode_infos_prefix = self.get_explode_infos_prefix()
7127
7128            # PZfields tags description
7129            PZfields_INFOS = {
7130                f"{pz_prefix}Tags": {
7131                    "ID": f"{pz_prefix}Tags",
7132                    "Number": ".",
7133                    "Type": "String",
7134                    "Description": "Variant tags based on annotation criteria",
7135                },
7136                f"{pz_prefix}Score": {
7137                    "ID": f"{pz_prefix}Score",
7138                    "Number": 1,
7139                    "Type": "Integer",
7140                    "Description": "Variant score based on annotation criteria",
7141                },
7142                f"{pz_prefix}Flag": {
7143                    "ID": f"{pz_prefix}Flag",
7144                    "Number": 1,
7145                    "Type": "String",
7146                    "Description": "Variant flag based on annotation criteria",
7147                },
7148                f"{pz_prefix}Comment": {
7149                    "ID": f"{pz_prefix}Comment",
7150                    "Number": ".",
7151                    "Type": "String",
7152                    "Description": "Variant comment based on annotation criteria",
7153                },
7154                f"{pz_prefix}Infos": {
7155                    "ID": f"{pz_prefix}Infos",
7156                    "Number": ".",
7157                    "Type": "String",
7158                    "Description": "Variant infos based on annotation criteria",
7159                },
7160                f"{pz_prefix}Class": {
7161                    "ID": f"{pz_prefix}Class",
7162                    "Number": ".",
7163                    "Type": "String",
7164                    "Description": "Variant class based on annotation criteria",
7165                },
7166            }
7167
7168            # Create INFO fields if not exist
7169            for field in PZfields_INFOS:
7170                field_ID = PZfields_INFOS[field]["ID"]
7171                field_description = PZfields_INFOS[field]["Description"]
7172                if field_ID not in self.get_header().infos and field_ID in pzfields:
7173                    field_description = (
7174                        PZfields_INFOS[field]["Description"]
7175                        + f", profile {default_profile}"
7176                    )
7177                    self.get_header().infos[field_ID] = vcf.parser._Info(
7178                        field_ID,
7179                        PZfields_INFOS[field]["Number"],
7180                        PZfields_INFOS[field]["Type"],
7181                        field_description,
7182                        "unknown",
7183                        "unknown",
7184                        code_type_map[PZfields_INFOS[field]["Type"]],
7185                    )
7186
7187            # Create INFO fields if not exist for each profile
7188            for profile in prioritizations_config:
7189                if profile in profiles or profiles == []:
7190                    for field in PZfields_INFOS:
7191                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
7192                        field_description = (
7193                            PZfields_INFOS[field]["Description"]
7194                            + f", profile {profile}"
7195                        )
7196                        if (
7197                            field_ID not in self.get_header().infos
7198                            and field in pzfields
7199                        ):
7200                            self.get_header().infos[field_ID] = vcf.parser._Info(
7201                                field_ID,
7202                                PZfields_INFOS[field]["Number"],
7203                                PZfields_INFOS[field]["Type"],
7204                                field_description,
7205                                "unknown",
7206                                "unknown",
7207                                code_type_map[PZfields_INFOS[field]["Type"]],
7208                            )
7209
7210            # Header
7211            for pzfield in list_of_pzfields:
7212                if re.match(f"{pz_prefix}Score.*", pzfield):
7213                    added_column = self.add_column(
7214                        table_name=table_variants,
7215                        column_name=pzfield,
7216                        column_type="INTEGER",
7217                        default_value="0",
7218                    )
7219                elif re.match(f"{pz_prefix}Flag.*", pzfield):
7220                    added_column = self.add_column(
7221                        table_name=table_variants,
7222                        column_name=pzfield,
7223                        column_type="BOOLEAN",
7224                        default_value="1",
7225                    )
7226                elif re.match(f"{pz_prefix}Class.*", pzfield):
7227                    added_column = self.add_column(
7228                        table_name=table_variants,
7229                        column_name=pzfield,
7230                        column_type="VARCHAR[]",
7231                        default_value="null",
7232                    )
7233                else:
7234                    added_column = self.add_column(
7235                        table_name=table_variants,
7236                        column_name=pzfield,
7237                        column_type="STRING",
7238                        default_value="''",
7239                    )
7240                added_columns.append(added_column)
7241
7242            # Profiles
7243            if profiles:
7244
7245                # foreach profile in configuration file
7246                for profile in prioritizations_config:
7247
7248                    # If profile is asked in param, or ALL are asked (empty profile [])
7249                    if profile in profiles or profiles == []:
7250                        log.info(f"Profile '{profile}'")
7251
7252                        sql_set_info_option = ""
7253
7254                        sql_set_info = []
7255
7256                        # PZ fields set
7257
7258                        # PZScore
7259                        if (
7260                            f"{pz_prefix}Score{pzfields_sep}{profile}"
7261                            in list_of_pzfields
7262                        ):
7263                            sql_set_info.append(
7264                                f"""
7265                                    concat(
7266                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
7267                                        {pz_prefix}Score{pzfields_sep}{profile}
7268                                    ) 
7269                                """
7270                            )
7271                            if (
7272                                profile == default_profile
7273                                and f"{pz_prefix}Score" in list_of_pzfields
7274                            ):
7275                                sql_set_info.append(
7276                                    f"""
7277                                        concat(
7278                                            '{pz_prefix}Score=',
7279                                            {pz_prefix}Score{pzfields_sep}{profile}
7280                                        )
7281                                    """
7282                                )
7283
7284                        # PZFlag
7285                        if (
7286                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
7287                            in list_of_pzfields
7288                        ):
7289                            sql_set_info.append(
7290                                f"""
7291                                    concat(
7292                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
7293                                        CASE 
7294                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7295                                            THEN 'PASS'
7296                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7297                                            THEN 'FILTERED'
7298                                        END
7299                                    ) 
7300                                """
7301                            )
7302                            if (
7303                                profile == default_profile
7304                                and f"{pz_prefix}Flag" in list_of_pzfields
7305                            ):
7306                                sql_set_info.append(
7307                                    f"""
7308                                        concat(
7309                                            '{pz_prefix}Flag=',
7310                                            CASE 
7311                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7312                                                THEN 'PASS'
7313                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7314                                                THEN 'FILTERED'
7315                                            END
7316                                        )
7317                                    """
7318                                )
7319
7320                        # PZClass
7321                        if (
7322                            f"{pz_prefix}Class{pzfields_sep}{profile}"
7323                            in list_of_pzfields
7324                        ):
7325                            sql_set_info.append(
7326                                f"""
7327                                    concat(
7328                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
7329                                        CASE
7330                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7331                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7332                                            ELSE '.'
7333                                        END 
7334                                    )
7335                                    
7336                                """
7337                            )
7338                            if (
7339                                profile == default_profile
7340                                and f"{pz_prefix}Class" in list_of_pzfields
7341                            ):
7342                                sql_set_info.append(
7343                                    f"""
7344                                        concat(
7345                                            '{pz_prefix}Class=',
7346                                            CASE
7347                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7348                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7349                                                ELSE '.'
7350                                            END 
7351                                        )
7352                                    """
7353                                )
7354
7355                        # PZComment
7356                        if (
7357                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
7358                            in list_of_pzfields
7359                        ):
7360                            sql_set_info.append(
7361                                f"""
7362                                    CASE
7363                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7364                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
7365                                        ELSE ''
7366                                    END
7367                                """
7368                            )
7369                            if (
7370                                profile == default_profile
7371                                and f"{pz_prefix}Comment" in list_of_pzfields
7372                            ):
7373                                sql_set_info.append(
7374                                    f"""
7375                                        CASE
7376                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7377                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
7378                                            ELSE ''
7379                                        END
7380                                    """
7381                                )
7382
7383                        # PZInfos
7384                        if (
7385                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
7386                            in list_of_pzfields
7387                        ):
7388                            sql_set_info.append(
7389                                f"""
7390                                    CASE
7391                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7392                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
7393                                        ELSE ''
7394                                    END
7395                                """
7396                            )
7397                            if (
7398                                profile == default_profile
7399                                and f"{pz_prefix}Infos" in list_of_pzfields
7400                            ):
7401                                sql_set_info.append(
7402                                    f"""
7403                                        CASE
7404                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7405                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
7406                                            ELSE ''
7407                                        END
7408                                    """
7409                                )
7410
7411                        # Merge PZfields
7412                        sql_set_info_option = ""
7413                        sql_set_sep = ""
7414                        for sql_set in sql_set_info:
7415                            if sql_set_sep:
7416                                sql_set_info_option += f"""
7417                                    , concat('{sql_set_sep}', {sql_set})
7418                                """
7419                            else:
7420                                sql_set_info_option += f"""
7421                                    , {sql_set}
7422                                """
7423                            sql_set_sep = ";"
7424
7425                        sql_queries = []
7426                        for annotation in prioritizations_config[profile]:
7427
7428                            # skip special sections
7429                            if annotation.startswith("_"):
7430                                continue
7431
7432                            # For each criterions
7433                            for criterion in prioritizations_config[profile][
7434                                annotation
7435                            ]:
7436
7437                                # Criterion mode
7438                                criterion_mode = None
7439                                if np.any(
7440                                    np.isin(list(criterion.keys()), ["type", "value"])
7441                                ):
7442                                    criterion_mode = "operation"
7443                                elif np.any(
7444                                    np.isin(list(criterion.keys()), ["sql", "fields"])
7445                                ):
7446                                    criterion_mode = "sql"
7447                                log.debug(f"Criterion Mode: {criterion_mode}")
7448
7449                                # Criterion parameters
7450                                criterion_type = criterion.get("type", None)
7451                                criterion_value = criterion.get("value", None)
7452                                criterion_sql = criterion.get("sql", None)
7453                                criterion_fields = criterion.get("fields", None)
7454                                criterion_score = criterion.get("score", 0)
7455                                criterion_flag = criterion.get("flag", "PASS")
7456                                criterion_class = criterion.get("class", None)
7457                                criterion_flag_bool = criterion_flag == "PASS"
7458                                criterion_comment = (
7459                                    ", ".join(criterion.get("comment", []))
7460                                    .replace("'", "''")
7461                                    .replace(";", ",")
7462                                    .replace("\t", " ")
7463                                )
7464                                criterion_infos = (
7465                                    str(criterion)
7466                                    .replace("'", "''")
7467                                    .replace(";", ",")
7468                                    .replace("\t", " ")
7469                                )
7470
7471                                # SQL
7472                                if criterion_sql is not None and isinstance(
7473                                    criterion_sql, list
7474                                ):
7475                                    criterion_sql = " ".join(criterion_sql)
7476
7477                                # Fields and explode
7478                                if criterion_fields is None:
7479                                    criterion_fields = [annotation]
7480                                if not isinstance(criterion_fields, list):
7481                                    criterion_fields = str(criterion_fields).split(",")
7482
7483                                # Class
7484                                if criterion_class is not None and not isinstance(
7485                                    criterion_class, list
7486                                ):
7487                                    criterion_class = str(criterion_class).split(",")
7488
7489                                for annotation_field in criterion_fields:
7490
7491                                    # Explode specific annotation
7492                                    log.debug(
7493                                        f"Explode annotation '{annotation_field}'"
7494                                    )
7495                                    added_columns += self.explode_infos(
7496                                        prefix=explode_infos_prefix,
7497                                        fields=[annotation_field],
7498                                        table=table_variants,
7499                                    )
7500                                    extra_infos = self.get_extra_infos(
7501                                        table=table_variants
7502                                    )
7503
7504                                    # Check if annotation field is present
7505                                    if (
7506                                        f"{explode_infos_prefix}{annotation_field}"
7507                                        not in extra_infos
7508                                    ):
7509                                        msq_err = f"Annotation '{annotation_field}' not in data"
7510                                        log.error(msq_err)
7511                                        raise ValueError(msq_err)
7512                                    else:
7513                                        log.debug(
7514                                            f"Annotation '{annotation_field}' in data"
7515                                        )
7516
7517                                sql_set = []
7518                                sql_set_info = []
7519
7520                                # PZ fields set
7521
7522                                # PZScore
7523                                if (
7524                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7525                                    in list_of_pzfields
7526                                ):
7527                                    # if prioritization_score_mode == "HOWARD":
7528                                    #     sql_set.append(
7529                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7530                                    #     )
7531                                    # VaRank prioritization score mode
7532                                    if prioritization_score_mode == "VaRank":
7533                                        sql_set.append(
7534                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
7535                                        )
7536                                    # default HOWARD prioritization score mode
7537                                    else:
7538                                        sql_set.append(
7539                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7540                                        )
7541
7542                                # PZFlag
7543                                if (
7544                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7545                                    in list_of_pzfields
7546                                ):
7547                                    sql_set.append(
7548                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7549                                    )
7550
7551                                # PZClass
7552                                if (
7553                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
7554                                    in list_of_pzfields
7555                                    and criterion_class is not None
7556                                ):
7557                                    sql_set.append(
7558                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
7559                                    )
7560
7561                                # PZComment
7562                                if (
7563                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7564                                    in list_of_pzfields
7565                                ):
7566                                    sql_set.append(
7567                                        f"""
7568                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7569                                                concat(
7570                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7571                                                    CASE 
7572                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7573                                                        THEN ', '
7574                                                        ELSE ''
7575                                                    END,
7576                                                    '{criterion_comment}'
7577                                                )
7578                                        """
7579                                    )
7580
7581                                # PZInfos
7582                                if (
7583                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7584                                    in list_of_pzfields
7585                                ):
7586                                    sql_set.append(
7587                                        f"""
7588                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7589                                                concat(
7590                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7591                                                    '{criterion_infos}'
7592                                                )
7593                                        """
7594                                    )
7595                                sql_set_option = ",".join(sql_set)
7596
7597                                # Criterion and comparison
7598                                if sql_set_option:
7599
7600                                    if criterion_mode in ["operation"]:
7601
7602                                        try:
7603                                            float(criterion_value)
7604                                            sql_update = f"""
7605                                                UPDATE {table_variants}
7606                                                SET {sql_set_option}
7607                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7608                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7609                                            """
7610                                        except:
7611                                            contains_option = ""
7612                                            if criterion_type == "contains":
7613                                                contains_option = ".*"
7614                                            sql_update = f"""
7615                                                UPDATE {table_variants}
7616                                                SET {sql_set_option}
7617                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7618                                            """
7619                                        sql_queries.append(sql_update)
7620
7621                                    elif criterion_mode in ["sql"]:
7622
7623                                        sql_update = f"""
7624                                            UPDATE {table_variants}
7625                                            SET {sql_set_option}
7626                                            WHERE {criterion_sql}
7627                                        """
7628                                        sql_queries.append(sql_update)
7629
7630                                    else:
7631                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
7632                                        log.error(msg_err)
7633                                        raise ValueError(msg_err)
7634
7635                                else:
7636                                    log.warning(
7637                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7638                                    )
7639
7640                        # PZTags
7641                        if (
7642                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7643                            in list_of_pzfields
7644                        ):
7645
7646                            # Create PZFalgs value
7647                            pztags_value = ""
7648                            pztags_sep_default = ","
7649                            pztags_sep = ""
7650                            for pzfield in pzfields:
7651                                if pzfield not in [f"{pz_prefix}Tags"]:
7652                                    if (
7653                                        f"{pzfield}{pzfields_sep}{profile}"
7654                                        in list_of_pzfields
7655                                    ):
7656                                        if pzfield in [f"{pz_prefix}Flag"]:
7657                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7658                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7659                                                    THEN 'PASS'
7660                                                    ELSE 'FILTERED'
7661                                                END, '"""
7662                                        elif pzfield in [f"{pz_prefix}Class"]:
7663                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7664                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7665                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7666                                                    ELSE '.'
7667                                                END, '"""
7668                                        else:
7669                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7670                                        pztags_sep = pztags_sep_default
7671
7672                            # Add Query update for PZFlags
7673                            sql_update_pztags = f"""
7674                                UPDATE {table_variants}
7675                                SET INFO = concat(
7676                                        INFO,
7677                                        CASE WHEN INFO NOT in ('','.')
7678                                                THEN ';'
7679                                                ELSE ''
7680                                        END,
7681                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7682                                    )
7683                                """
7684                            sql_queries.append(sql_update_pztags)
7685
7686                            # Add Query update for PZFlags for default
7687                            if profile == default_profile:
7688                                sql_update_pztags_default = f"""
7689                                UPDATE {table_variants}
7690                                SET INFO = concat(
7691                                        INFO,
7692                                        ';',
7693                                        '{pz_prefix}Tags={pztags_value}'
7694                                    )
7695                                """
7696                                sql_queries.append(sql_update_pztags_default)
7697
7698                        log.info(f"""Profile '{profile}' - Prioritization... """)
7699
7700                        if sql_queries:
7701
7702                            for sql_query in sql_queries:
7703                                log.debug(
7704                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7705                                )
7706                                self.conn.execute(sql_query)
7707
7708                        log.info(f"""Profile '{profile}' - Update... """)
7709                        sql_query_update = f"""
7710                            UPDATE {table_variants}
7711                            SET INFO =  
7712                                concat(
7713                                    CASE
7714                                        WHEN INFO NOT IN ('','.')
7715                                        THEN concat(INFO, ';')
7716                                        ELSE ''
7717                                    END
7718                                    {sql_set_info_option}
7719                                )
7720                        """
7721                        self.conn.execute(sql_query_update)
7722
7723        else:
7724
7725            log.warning(f"No profiles in parameters")
7726
7727        # Remove added columns
7728        for added_column in added_columns:
7729            self.drop_column(column=added_column)
7730
7731        # Explode INFOS fields into table fields
7732        if self.get_explode_infos():
7733            self.explode_infos(
7734                prefix=self.get_explode_infos_prefix(),
7735                fields=self.get_explode_infos_fields(),
7736                force=True,
7737            )
7738
7739        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function is used to specify the name of the table (presumably loaded from a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns

The prioritization function returns a boolean value (True) on completion.

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        Annotate SNV/InDel variants with HGVS nomenclature and append the result
        as an 'hgvs=' entry in the INFO column of the variants table.

        The method builds, per variant, a comma-separated list of HGVS names
        (one per overlapping refSeq transcript, optionally with protein-level
        names) using the genome FASTA, the refSeq database and optionally
        refSeqLink. The per-variant work is parallelized with Dask partitions.
        The VCF header is extended with an 'hgvs' INFO field.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function applied to each partition of the Dask Dataframe
        def partition_function(partition):
            """
            Apply `annotation_hgvs_partition` to each row of the given
            pandas DataFrame partition.

            :param partition: pandas DataFrame with CHROM/POS/REF/ALT columns
            :return: Series of HGVS annotation strings, one per row
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Build the comma-separated list of HGVS names for one variant row.

            :param row: dictionary-like object providing "CHROM", "POS",
                "REF" and "ALT"
            :return: comma-separated HGVS names for all transcripts overlapping
                the variant position (empty string if none)
            """

            # NOTE(review): `chr` shadows the builtin; kept as-is (local scope only)
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find the transcripts overlapping this position.
            # NOTE: chr/pos are interpolated directly into the SQL; values come
            # from the variants table itself, not from external user input.
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Accumulates one HGVS name per transcript (plus optional protein form)
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model for this transcript name
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number only when requested (extra lookup cost)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession lookup via refSeqLink.
                # NOTE(review): `refseqlink_df` is only registered when a
                # refSeqLink file was found — presumably protein options are
                # only enabled in that case; confirm against callers.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # Primary HGVS name with the configured rendering options
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally also emit the protein-level HGVS name as a second
                # entry (only when it is not already part of the primary name)
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Comma-separated list of all HGVS annotations for this variant
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection (eager SQL over registered global dataframes)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (falls back to the default genomes folder)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Explicit genome path from config ("" when not set)
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refSeq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refSeq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "key=value,key2,..." options into param["hgvs"].
        # A bare key (no "=") means True; "true"/"false" strings are coerced to bool.
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; return early when it is not
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS rendering options (all read by annotation_hgvs_partition above)
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink: param overrides the config-level values
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly (param takes precedence over config, then default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: prefer the explicit path, fall back to folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq file lookup
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink file lookup (optional)
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (alphabetic REF and ALT)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Temporary columns to drop at the end
        added_columns = []

        # Add a randomized temporary hgvs column in the variants table
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Load transcript names overlapping the variants into a polars Dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Load transcript->protein accession mapping
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model
        # (exported to TSV first, then parsed with read_transcripts).
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection, re-created here — presumably so that the
        # refseq_df/refseqlink_df globals defined above are registered; confirm
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from the pandas dataframe with one
        # partition per thread
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Apply the HGVS annotation function on each partition in parallel
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame back to a pandas Dataframe
        df = ddf.compute()

        # Round-trip through parquet (works around a VARCHAR -> NULL cast error)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Write the computed HGVS strings into the temporary column,
            # skipping empty/NULL results
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Append 'hgvs=<value>' to INFO, prefixing a ';' only when INFO
        # already has content
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Declare the 'hgvs' INFO field in the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove the temporary columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
8132    def get_operations_help(
8133        self, operations_config_dict: dict = {}, operations_config_file: str = None
8134    ) -> list:
8135
8136        # Init
8137        operations_help = []
8138
8139        # operations
8140        operations = self.get_config_json(
8141            name="calculations",
8142            config_dict=operations_config_dict,
8143            config_file=operations_config_file,
8144        )
8145        for op in operations:
8146            op_name = operations[op].get("name", op).upper()
8147            op_description = operations[op].get("description", op_name)
8148            op_available = operations[op].get("available", False)
8149            if op_available:
8150                operations_help.append(f"   {op_name}: {op_description}")
8151
8152        # Sort operations
8153        operations_help.sort()
8154
8155        # insert header
8156        operations_help.insert(0, "Available calculation operations:")
8157
8158        # Return
8159        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run calculation operations on the variants table. For each requested
        operation, look up its definition in the operations configuration and
        dispatch to the python or sql processing function.

        Operations come from, in order of precedence:
        `param["calculation"]["calculations"]`, the `operations` argument, and
        the quick `param["calculations"]` comma-separated string (which is
        merged in front while keeping its order).

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle": null
                }
            }

        :param operations: dict mapping operation names to their options
            (NOTE(review): mutable default argument; only read here, never mutated)
        :param operations_config_dict: optional operations configuration dict
        :param operations_config_file: optional operations configuration file path
        :raises ValueError: if an operation name or its type is not available
            in the operations configuration
        """

        # Param
        param = self.get_param()

        # Operations configuration (dict and/or file)
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Normalize configuration keys to upper case for case-insensitive lookup
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation: merge the comma-separated list in, keeping its order
        if param.get("calculations", None):

            # List of operation names, whitespace-trimmed
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Build tmp operations dict to preserve quick-list order,
            # reusing any options already defined for the same operation
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Append operations already configured in param but not in the quick list
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Replace with the merged, ordered dict
            operations = operations_tmp

        # Final fallback: operations from param
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # Dispatch each operation to its python or sql processor
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    # Type defaults to "sql" when not specified
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
8283    def calculation_process_sql(
8284        self, operation: dict, operation_name: str = "unknown"
8285    ) -> None:
8286        """
8287        The `calculation_process_sql` function takes in a mathematical operation as a string and
8288        performs the operation, updating the specified table with the result.
8289
8290        :param operation: The `operation` parameter is a dictionary that contains information about the
8291        mathematical operation to be performed. It includes the following keys:
8292        :type operation: dict
8293        :param operation_name: The `operation_name` parameter is a string that represents the name of
8294        the mathematical operation being performed. It is used for logging and error handling purposes,
8295        defaults to unknown
8296        :type operation_name: str (optional)
8297        """
8298
8299        # table variants
8300        table_variants = self.get_table_variants(clause="alter")
8301
8302        # Operation infos
8303        operation_name = operation.get("name", "unknown")
8304        log.debug(f"process sql {operation_name}")
8305        output_column_name = operation.get("output_column_name", operation_name)
8306        output_column_type = operation.get("output_column_type", "String")
8307        prefix = operation.get("explode_infos_prefix", "")
8308        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
8309        output_column_description = operation.get(
8310            "output_column_description", f"{operation_name} operation"
8311        )
8312        operation_query = operation.get("operation_query", None)
8313        if isinstance(operation_query, list):
8314            operation_query = " ".join(operation_query)
8315        operation_info_fields = operation.get("info_fields", [])
8316        operation_info_fields_check = operation.get("info_fields_check", False)
8317        operation_info = operation.get("operation_info", True)
8318
8319        if operation_query:
8320
8321            # Info fields check
8322            operation_info_fields_check_result = True
8323            if operation_info_fields_check:
8324                header_infos = self.get_header().infos
8325                for info_field in operation_info_fields:
8326                    operation_info_fields_check_result = (
8327                        operation_info_fields_check_result
8328                        and info_field in header_infos
8329                    )
8330
8331            # If info fields available
8332            if operation_info_fields_check_result:
8333
8334                # Added_columns
8335                added_columns = []
8336
8337                # Create VCF header field
8338                vcf_reader = self.get_header()
8339                vcf_reader.infos[output_column_name] = vcf.parser._Info(
8340                    output_column_name,
8341                    ".",
8342                    output_column_type,
8343                    output_column_description,
8344                    "howard calculation",
8345                    "0",
8346                    self.code_type_map.get(output_column_type),
8347                )
8348
8349                # Explode infos if needed
8350                log.debug(f"calculation_process_sql prefix {prefix}")
8351                added_columns += self.explode_infos(
8352                    prefix=prefix,
8353                    fields=[output_column_name] + operation_info_fields,
8354                    force=True,
8355                )
8356
8357                # Create column
8358                added_column = self.add_column(
8359                    table_name=table_variants,
8360                    column_name=prefix + output_column_name,
8361                    column_type=output_column_type_sql,
8362                    default_value="null",
8363                )
8364                added_columns.append(added_column)
8365
8366                # Operation calculation
8367                try:
8368
8369                    # Query to update calculation column
8370                    sql_update = f"""
8371                        UPDATE {table_variants}
8372                        SET "{prefix}{output_column_name}" = ({operation_query})
8373                    """
8374                    self.conn.execute(sql_update)
8375
8376                    # Add to INFO
8377                    if operation_info:
8378                        sql_update_info = f"""
8379                            UPDATE {table_variants}
8380                            SET "INFO" =
8381                                concat(
8382                                    CASE
8383                                        WHEN "INFO" IS NOT NULL
8384                                        THEN concat("INFO", ';')
8385                                        ELSE ''
8386                                    END,
8387                                    '{output_column_name}=',
8388                                    "{prefix}{output_column_name}"
8389                                )
8390                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
8391                        """
8392                        self.conn.execute(sql_update_info)
8393
8394                except:
8395                    log.error(
8396                        f"Operations config: Calculation '{operation_name}' query failed"
8397                    )
8398                    raise ValueError(
8399                        f"Operations config: Calculation '{operation_name}' query failed"
8400                    )
8401
8402                # Remove added columns
8403                for added_column in added_columns:
8404                    log.debug(f"added_column: {added_column}")
8405                    self.drop_column(column=added_column)
8406
8407            else:
8408                log.error(
8409                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8410                )
8411                raise ValueError(
8412                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8413                )
8414
8415        else:
8416            log.error(
8417                f"Operations config: Calculation '{operation_name}' query NOT defined"
8418            )
8419            raise ValueError(
8420                f"Operations config: Calculation '{operation_name}' query NOT defined"
8421            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
8423    def calculation_process_function(
8424        self, operation: dict, operation_name: str = "unknown"
8425    ) -> None:
8426        """
8427        The `calculation_process_function` takes in an operation dictionary and performs the specified
8428        function with the given parameters.
8429
8430        :param operation: The `operation` parameter is a dictionary that contains information about the
8431        operation to be performed. It has the following keys:
8432        :type operation: dict
8433        :param operation_name: The `operation_name` parameter is a string that represents the name of
8434        the operation being performed. It is used for logging purposes, defaults to unknown
8435        :type operation_name: str (optional)
8436        """
8437
8438        operation_name = operation["name"]
8439        log.debug(f"process sql {operation_name}")
8440        function_name = operation["function_name"]
8441        function_params = operation["function_params"]
8442        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
8444    def calculation_variant_id(self) -> None:
8445        """
8446        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
8447        updates the INFO field of a variants table with the variant ID.
8448        """
8449
8450        # variant_id annotation field
8451        variant_id_tag = self.get_variant_id_column()
8452        added_columns = [variant_id_tag]
8453
8454        # variant_id hgvs tags"
8455        vcf_infos_tags = {
8456            variant_id_tag: "howard variant ID annotation",
8457        }
8458
8459        # Variants table
8460        table_variants = self.get_table_variants()
8461
8462        # Header
8463        vcf_reader = self.get_header()
8464
8465        # Add variant_id to header
8466        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
8467            variant_id_tag,
8468            ".",
8469            "String",
8470            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
8471            "howard calculation",
8472            "0",
8473            self.code_type_map.get("String"),
8474        )
8475
8476        # Update
8477        sql_update = f"""
8478            UPDATE {table_variants}
8479            SET "INFO" = 
8480                concat(
8481                    CASE
8482                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8483                        THEN ''
8484                        ELSE concat("INFO", ';')
8485                    END,
8486                    '{variant_id_tag}=',
8487                    "{variant_id_tag}"
8488                )
8489        """
8490        self.conn.execute(sql_update)
8491
8492        # Remove added columns
8493        for added_column in added_columns:
8494            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
8496    def calculation_extract_snpeff_hgvs(
8497        self,
8498        snpeff_hgvs: str = "snpeff_hgvs",
8499        snpeff_field: str = "ANN",
8500    ) -> None:
8501        """
8502        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
8503        annotation field in a VCF file and adds them as a new column in the variants table.
8504
8505        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
8506        function is used to specify the name of the column that will store the HGVS nomenclatures
8507        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
8508        snpeff_hgvs
8509        :type snpeff_hgvs: str (optional)
8510        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
8511        function represents the field in the VCF file that contains SnpEff annotations. This field is
8512        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
8513        to ANN
8514        :type snpeff_field: str (optional)
8515        """
8516
8517        # Snpeff hgvs tags
8518        vcf_infos_tags = {
8519            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
8520        }
8521
8522        # Prefix
8523        prefix = self.get_explode_infos_prefix()
8524        if prefix:
8525            prefix = "INFO/"
8526
8527        # snpEff fields
8528        speff_ann_infos = prefix + snpeff_field
8529        speff_hgvs_infos = prefix + snpeff_hgvs
8530
8531        # Variants table
8532        table_variants = self.get_table_variants()
8533
8534        # Header
8535        vcf_reader = self.get_header()
8536
8537        # Add columns
8538        added_columns = []
8539
8540        # Explode HGVS field in column
8541        added_columns += self.explode_infos(fields=[snpeff_field])
8542
8543        if snpeff_field in vcf_reader.infos:
8544
8545            log.debug(vcf_reader.infos[snpeff_field])
8546
8547            # Extract ANN header
8548            ann_description = vcf_reader.infos[snpeff_field].desc
8549            pattern = r"'(.+?)'"
8550            match = re.search(pattern, ann_description)
8551            if match:
8552                ann_header_match = match.group(1).split(" | ")
8553                ann_header_desc = {}
8554                for i in range(len(ann_header_match)):
8555                    ann_header_info = "".join(
8556                        char for char in ann_header_match[i] if char.isalnum()
8557                    )
8558                    ann_header_desc[ann_header_info] = ann_header_match[i]
8559                if not ann_header_desc:
8560                    raise ValueError("Invalid header description format")
8561            else:
8562                raise ValueError("Invalid header description format")
8563
8564            # Create variant id
8565            variant_id_column = self.get_variant_id_column()
8566            added_columns += [variant_id_column]
8567
8568            # Create dataframe
8569            dataframe_snpeff_hgvs = self.get_query_to_df(
8570                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8571            )
8572
8573            # Create main NOMEN column
8574            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8575                speff_ann_infos
8576            ].apply(
8577                lambda x: extract_snpeff_hgvs(
8578                    str(x), header=list(ann_header_desc.values())
8579                )
8580            )
8581
8582            # Add snpeff_hgvs to header
8583            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
8584                snpeff_hgvs,
8585                ".",
8586                "String",
8587                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
8588                "howard calculation",
8589                "0",
8590                self.code_type_map.get("String"),
8591            )
8592
8593            # Update
8594            sql_update = f"""
8595                UPDATE variants
8596                SET "INFO" = 
8597                    concat(
8598                        CASE
8599                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8600                            THEN ''
8601                            ELSE concat("INFO", ';')
8602                        END,
8603                        CASE 
8604                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8605                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8606                            THEN concat(
8607                                    '{snpeff_hgvs}=',
8608                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8609                                )
8610                            ELSE ''
8611                        END
8612                    )
8613                FROM dataframe_snpeff_hgvs
8614                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8615
8616            """
8617            self.conn.execute(sql_update)
8618
8619            # Delete dataframe
8620            del dataframe_snpeff_hgvs
8621            gc.collect()
8622
8623        else:
8624
8625            log.warning(
8626                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8627            )
8628
8629        # Remove added columns
8630        for added_column in added_columns:
8631            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file. This parameter allows you to customize the output column name; defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a new column; defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
8633    def calculation_snpeff_ann_explode(
8634        self,
8635        uniquify: bool = True,
8636        output_format: str = "fields",
8637        output_prefix: str = "snpeff_",
8638        snpeff_field: str = "ANN",
8639    ) -> None:
8640        """
8641        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
8642        exploding the HGVS field and updating variant information accordingly.
8643
8644        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8645        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8646        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8647        defaults to True
8648        :type uniquify: bool (optional)
8649        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8650        function specifies the format in which the output annotations will be generated. It has a
8651        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8652        format, defaults to fields
8653        :type output_format: str (optional)
8654        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8655        method is used to specify the prefix that will be added to the output annotations generated
8656        during the calculation process. This prefix helps to differentiate the newly added annotations
8657        from existing ones in the output data. By default, the, defaults to ANN_
8658        :type output_prefix: str (optional)
8659        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8660        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8661        field will be processed to explode the HGVS annotations and update the variant information
8662        accordingly, defaults to ANN
8663        :type snpeff_field: str (optional)
8664        """
8665
8666        # SnpEff annotation field
8667        snpeff_hgvs = "snpeff_ann_explode"
8668
8669        # Snpeff hgvs tags
8670        vcf_infos_tags = {
8671            snpeff_hgvs: "Explode snpEff annotations",
8672        }
8673
8674        # Prefix
8675        prefix = self.get_explode_infos_prefix()
8676        if prefix:
8677            prefix = "INFO/"
8678
8679        # snpEff fields
8680        speff_ann_infos = prefix + snpeff_field
8681        speff_hgvs_infos = prefix + snpeff_hgvs
8682
8683        # Variants table
8684        table_variants = self.get_table_variants()
8685
8686        # Header
8687        vcf_reader = self.get_header()
8688
8689        # Add columns
8690        added_columns = []
8691
8692        # Explode HGVS field in column
8693        added_columns += self.explode_infos(fields=[snpeff_field])
8694        log.debug(f"snpeff_field={snpeff_field}")
8695        log.debug(f"added_columns={added_columns}")
8696
8697        if snpeff_field in vcf_reader.infos:
8698
8699            # Extract ANN header
8700            ann_description = vcf_reader.infos[snpeff_field].desc
8701            pattern = r"'(.+?)'"
8702            match = re.search(pattern, ann_description)
8703            if match:
8704                ann_header_match = match.group(1).split(" | ")
8705                ann_header = []
8706                ann_header_desc = {}
8707                for i in range(len(ann_header_match)):
8708                    ann_header_info = "".join(
8709                        char for char in ann_header_match[i] if char.isalnum()
8710                    )
8711                    ann_header.append(ann_header_info)
8712                    ann_header_desc[ann_header_info] = ann_header_match[i]
8713                if not ann_header_desc:
8714                    raise ValueError("Invalid header description format")
8715            else:
8716                raise ValueError("Invalid header description format")
8717
8718            # Create variant id
8719            variant_id_column = self.get_variant_id_column()
8720            added_columns += [variant_id_column]
8721
8722            # Create dataframe
8723            dataframe_snpeff_hgvs = self.get_query_to_df(
8724                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8725            )
8726
8727            # Create snpEff columns
8728            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8729                speff_ann_infos
8730            ].apply(
8731                lambda x: explode_snpeff_ann(
8732                    str(x),
8733                    uniquify=uniquify,
8734                    output_format=output_format,
8735                    prefix=output_prefix,
8736                    header=list(ann_header_desc.values()),
8737                )
8738            )
8739
8740            # Header
8741            ann_annotations_prefix = ""
8742            if output_format.upper() in ["JSON"]:
8743                ann_annotations_prefix = f"{output_prefix}="
8744                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8745                    output_prefix,
8746                    ".",
8747                    "String",
8748                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8749                    + " - JSON format",
8750                    "howard calculation",
8751                    "0",
8752                    self.code_type_map.get("String"),
8753                )
8754            else:
8755                for ann_annotation in ann_header:
8756                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8757                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8758                        ann_annotation_id,
8759                        ".",
8760                        "String",
8761                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8762                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8763                        "howard calculation",
8764                        "0",
8765                        self.code_type_map.get("String"),
8766                    )
8767
8768            # Update
8769            sql_update = f"""
8770                UPDATE variants
8771                SET "INFO" = 
8772                    concat(
8773                        CASE
8774                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8775                            THEN ''
8776                            ELSE concat("INFO", ';')
8777                        END,
8778                        CASE 
8779                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8780                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8781                            THEN concat(
8782                                '{ann_annotations_prefix}',
8783                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8784                                )
8785                            ELSE ''
8786                        END
8787                    )
8788                FROM dataframe_snpeff_hgvs
8789                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8790
8791            """
8792            self.conn.execute(sql_update)
8793
8794            # Delete dataframe
8795            del dataframe_snpeff_hgvs
8796            gc.collect()
8797
8798        else:
8799
8800            log.warning(
8801                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8802            )
8803
8804        # Remove added columns
8805        for added_column in added_columns:
8806            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data; defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
    def calculation_extract_nomen(self) -> None:
        """
        Extract HGVS nomenclature fields (NOMEN and its components) and append
        them to the INFO column of the variants table.

        The HGVS source field and the transcript-preference options are read
        from the "calculation.calculations.NOMEN.options" section of the
        parameters. For each variant, `find_nomen` selects the reference
        nomenclature, the resulting components are declared in the VCF header
        and written back into INFO via a SQL join on #CHROM/POS/REF/ALT.

        :raises ValueError: If the configured transcripts file does not exist
        """

        # Temporary dataframe column holding the dict returned by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: each component tag mapped to its header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Get HGVS field (INFO field containing the HGVS list to parse)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern (optional pattern passed through to find_nomen)
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources (keyed by source name)
        transcripts_sources = {}

        # Get transcripts preference file (first column = transcript IDs)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table (defaults to the variants table itself)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column (per-variant preferred transcript)
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # SQL expression selecting the per-variant preferred transcript
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            # No transcript column configured: select NULL as transcript
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order (which source wins)
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos: exploded columns actually present
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe with variant keys, HGVS value and transcript
            # NOTE(review): table name "variants" is hard-coded here (and in
            # the UPDATE below) instead of using get_table_variants() — confirm
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Create main NOMEN column: dict of NOMEN components per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # SQL fragment appending ';FIELD=value' when the value is set
                # NOTE(review): the leading ';' is emitted even when INFO is
                # empty, producing an INFO field starting with ';' — confirm
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update (one concat argument per NOMEN component)
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO by joining on the variant key columns
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
9007    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
9008        """
9009        The function `calculation_find_by_pipeline` performs a calculation to find the number of
9010        pipeline/sample for a variant and updates the variant information in a VCF file.
9011
9012        :param tag: The `tag` parameter is a string that represents the annotation field for the
9013        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
9014        VCF header and to update the corresponding field in the variants table, defaults to
9015        findbypipeline
9016        :type tag: str (optional)
9017        """
9018
9019        # if FORMAT and samples
9020        if (
9021            "FORMAT" in self.get_header_columns_as_list()
9022            and self.get_header_sample_list()
9023        ):
9024
9025            # findbypipeline annotation field
9026            findbypipeline_tag = tag
9027
9028            # VCF infos tags
9029            vcf_infos_tags = {
9030                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
9031            }
9032
9033            # Prefix
9034            prefix = self.get_explode_infos_prefix()
9035
9036            # Field
9037            findbypipeline_infos = prefix + findbypipeline_tag
9038
9039            # Variants table
9040            table_variants = self.get_table_variants()
9041
9042            # Header
9043            vcf_reader = self.get_header()
9044
9045            # Create variant id
9046            variant_id_column = self.get_variant_id_column()
9047            added_columns = [variant_id_column]
9048
9049            # variant_id, FORMAT and samples
9050            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9051                self.get_header_sample_list()
9052            )
9053
9054            # Create dataframe
9055            dataframe_findbypipeline = self.get_query_to_df(
9056                f""" SELECT {samples_fields} FROM {table_variants} """
9057            )
9058
9059            # Create findbypipeline column
9060            dataframe_findbypipeline[findbypipeline_infos] = (
9061                dataframe_findbypipeline.apply(
9062                    lambda row: findbypipeline(
9063                        row, samples=self.get_header_sample_list()
9064                    ),
9065                    axis=1,
9066                )
9067            )
9068
9069            # Add snpeff_hgvs to header
9070            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
9071                findbypipeline_tag,
9072                ".",
9073                "String",
9074                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
9075                "howard calculation",
9076                "0",
9077                self.code_type_map.get("String"),
9078            )
9079
9080            # Update
9081            sql_update = f"""
9082                UPDATE variants
9083                SET "INFO" = 
9084                    concat(
9085                        CASE
9086                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9087                            THEN ''
9088                            ELSE concat("INFO", ';')
9089                        END,
9090                        CASE 
9091                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
9092                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
9093                            THEN concat(
9094                                    '{findbypipeline_tag}=',
9095                                    dataframe_findbypipeline."{findbypipeline_infos}"
9096                                )
9097                            ELSE ''
9098                        END
9099                    )
9100                FROM dataframe_findbypipeline
9101                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
9102            """
9103            self.conn.execute(sql_update)
9104
9105            # Remove added columns
9106            for added_column in added_columns:
9107                self.drop_column(column=added_column)
9108
9109            # Delete dataframe
9110            del dataframe_findbypipeline
9111            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
9113    def calculation_genotype_concordance(self) -> None:
9114        """
9115        The function `calculation_genotype_concordance` calculates the genotype concordance for
9116        multi-caller VCF files and updates the variant information in the database.
9117        """
9118
9119        # if FORMAT and samples
9120        if (
9121            "FORMAT" in self.get_header_columns_as_list()
9122            and self.get_header_sample_list()
9123        ):
9124
9125            # genotypeconcordance annotation field
9126            genotypeconcordance_tag = "genotypeconcordance"
9127
9128            # VCF infos tags
9129            vcf_infos_tags = {
9130                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
9131            }
9132
9133            # Prefix
9134            prefix = self.get_explode_infos_prefix()
9135
9136            # Field
9137            genotypeconcordance_infos = prefix + genotypeconcordance_tag
9138
9139            # Variants table
9140            table_variants = self.get_table_variants()
9141
9142            # Header
9143            vcf_reader = self.get_header()
9144
9145            # Create variant id
9146            variant_id_column = self.get_variant_id_column()
9147            added_columns = [variant_id_column]
9148
9149            # variant_id, FORMAT and samples
9150            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9151                self.get_header_sample_list()
9152            )
9153
9154            # Create dataframe
9155            dataframe_genotypeconcordance = self.get_query_to_df(
9156                f""" SELECT {samples_fields} FROM {table_variants} """
9157            )
9158
9159            # Create genotypeconcordance column
9160            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
9161                dataframe_genotypeconcordance.apply(
9162                    lambda row: genotypeconcordance(
9163                        row, samples=self.get_header_sample_list()
9164                    ),
9165                    axis=1,
9166                )
9167            )
9168
9169            # Add genotypeconcordance to header
9170            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
9171                genotypeconcordance_tag,
9172                ".",
9173                "String",
9174                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
9175                "howard calculation",
9176                "0",
9177                self.code_type_map.get("String"),
9178            )
9179
9180            # Update
9181            sql_update = f"""
9182                UPDATE variants
9183                SET "INFO" = 
9184                    concat(
9185                        CASE
9186                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9187                            THEN ''
9188                            ELSE concat("INFO", ';')
9189                        END,
9190                        CASE
9191                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
9192                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
9193                            THEN concat(
9194                                    '{genotypeconcordance_tag}=',
9195                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
9196                                )
9197                            ELSE ''
9198                        END
9199                    )
9200                FROM dataframe_genotypeconcordance
9201                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
9202            """
9203            self.conn.execute(sql_update)
9204
9205            # Remove added columns
9206            for added_column in added_columns:
9207                self.drop_column(column=added_column)
9208
9209            # Delete dataframe
9210            del dataframe_genotypeconcordance
9211            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
9213    def calculation_barcode(self, tag: str = "barcode") -> None:
9214        """
9215        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
9216        updates the INFO field in the file with the calculated barcode values.
9217
9218        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
9219        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
9220        the default tag name is set to "barcode", defaults to barcode
9221        :type tag: str (optional)
9222        """
9223
9224        # if FORMAT and samples
9225        if (
9226            "FORMAT" in self.get_header_columns_as_list()
9227            and self.get_header_sample_list()
9228        ):
9229
9230            # barcode annotation field
9231            if not tag:
9232                tag = "barcode"
9233
9234            # VCF infos tags
9235            vcf_infos_tags = {
9236                tag: "barcode calculation (VaRank)",
9237            }
9238
9239            # Prefix
9240            prefix = self.get_explode_infos_prefix()
9241
9242            # Field
9243            barcode_infos = prefix + tag
9244
9245            # Variants table
9246            table_variants = self.get_table_variants()
9247
9248            # Header
9249            vcf_reader = self.get_header()
9250
9251            # Create variant id
9252            variant_id_column = self.get_variant_id_column()
9253            added_columns = [variant_id_column]
9254
9255            # variant_id, FORMAT and samples
9256            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9257                self.get_header_sample_list()
9258            )
9259
9260            # Create dataframe
9261            dataframe_barcode = self.get_query_to_df(
9262                f""" SELECT {samples_fields} FROM {table_variants} """
9263            )
9264
9265            # Create barcode column
9266            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9267                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
9268            )
9269
9270            # Add barcode to header
9271            vcf_reader.infos[tag] = vcf.parser._Info(
9272                tag,
9273                ".",
9274                "String",
9275                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
9276                "howard calculation",
9277                "0",
9278                self.code_type_map.get("String"),
9279            )
9280
9281            # Update
9282            sql_update = f"""
9283                UPDATE {table_variants}
9284                SET "INFO" = 
9285                    concat(
9286                        CASE
9287                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9288                            THEN ''
9289                            ELSE concat("INFO", ';')
9290                        END,
9291                        CASE
9292                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
9293                            AND dataframe_barcode."{barcode_infos}" NOT NULL
9294                            THEN concat(
9295                                    '{tag}=',
9296                                    dataframe_barcode."{barcode_infos}"
9297                                )
9298                            ELSE ''
9299                        END
9300                    )
9301                FROM dataframe_barcode
9302                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9303            """
9304            self.conn.execute(sql_update)
9305
9306            # Remove added columns
9307            for added_column in added_columns:
9308                self.drop_column(column=added_column)
9309
9310            # Delete dataframe
9311            del dataframe_barcode
9312            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Calculate a family barcode for variants and write it into each sample's
        genotype (FORMAT) fields.

        The pedigree is read from param `calculation.calculations.BARCODEFAMILY.family_pedigree`
        (a JSON file path, a JSON string, a comma-separated sample list, or a
        dict); when absent, all samples are used. The barcode (FORMAT field
        `tag`) and the list of samples used (FORMAT field `tag` + "S") are
        appended to the FORMAT column and to every sample column. Only runs if
        the VCF has a FORMAT column and at least one sample.

        :param tag: The `tag` parameter is used to specify the barcode FORMAT
        field added to the VCF file; the sample-list field is named `tag` + "S".
        Falsy values fall back to "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or empty
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against None/empty tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags (descriptions for the two FORMAT fields)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param (pedigree configuration from param dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a path to a JSON file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated samples
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as "sample1,sample2,..." and map each
                        # sample name to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct the list of samples taking part in the barcode
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample of the VCF
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the member -> sample mapping actually used
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field (dataframe column) holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (registered for removal at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe with genotypes of the pedigree samples
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute barcode per variant (row-wise over pedigree samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family fields to header (FORMAT declarations:
            # `tag` for the barcode value, `tag`+"S" for the sample list)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: pedigree samples get the barcode
            # value + sample list, FORMAT gets the two field names, other
            # samples get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Characters making up FORMAT field ids; stripping them leaves
                # only the ':' separators, then ':' -> ':.' yields one missing
                # value ('.') per FORMAT field for './.' genotypes
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Single UPDATE applying every SET clause, joined on variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()

The calculation_barcode_family function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
    def calculation_trio(self) -> None:
        """
        Perform trio calculations (father/mother/child) on a VCF file by adding
        a 'trio=' entry to the INFO field of each variant.

        The trio pedigree is read from param
        `calculation.calculations.TRIO.trio_pedigree` (a JSON file path, a JSON
        string, a comma-separated list of exactly three samples, or a dict with
        keys "father", "mother", "child"); when absent, the first three samples
        of the VCF are used. Only runs if the VCF has a FORMAT column and at
        least one sample.

        :raises ValueError: if the trio pedigree is not well formatted or does
        not resolve to exactly three members
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Trio param (pedigree configuration from param dict)
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio: normalize the pedigree into a dict with
            # "father"/"mother"/"child" keys
            if trio_ped:

                # Trio pedigree is a path to a JSON file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string: JSON first, else a comma-separated
                # list of exactly three samples (father, mother, child)
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is already a dict: use as-is
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio sample list in fixed father/mother/child order
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                # No pedigree configured: use the first 3 samples of the VCF
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree resolves to exactly 3 members
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the member -> sample mapping actually used
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Field (dataframe column) holding the computed trio value
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (registered for removal at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with genotypes
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute trio annotation per variant (row-wise over trio samples)
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the fallback description below looks like a
            # copy-paste from the snpEff tag; it is dead since the key exists
            # in vcf_infos_tags above
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO, joining on the variant id; the inner CASE skips
            # empty/missing trio values so no dangling 'trio=' is produced
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                             AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                    '{trio_tag}=',
                                    dataframe_trio."{trio_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_trio
            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9683    def calculation_vaf_normalization(self) -> None:
9684        """
9685        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9686        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9687        :return: The function does not return anything.
9688        """
9689
9690        # if FORMAT and samples
9691        if (
9692            "FORMAT" in self.get_header_columns_as_list()
9693            and self.get_header_sample_list()
9694        ):
9695
9696            # vaf_normalization annotation field
9697            vaf_normalization_tag = "VAF"
9698
9699            # VCF infos tags
9700            vcf_infos_tags = {
9701                "VAF": "VAF Variant Frequency",
9702            }
9703
9704            # Prefix
9705            prefix = self.get_explode_infos_prefix()
9706
9707            # Variants table
9708            table_variants = self.get_table_variants()
9709
9710            # Header
9711            vcf_reader = self.get_header()
9712
9713            # Do not calculate if VAF already exists
9714            if "VAF" in vcf_reader.formats:
9715                log.debug("VAF already on genotypes")
9716                return
9717
9718            # Create variant id
9719            variant_id_column = self.get_variant_id_column()
9720            added_columns = [variant_id_column]
9721
9722            # variant_id, FORMAT and samples
9723            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9724                f""" "{sample}" """ for sample in self.get_header_sample_list()
9725            )
9726
9727            # Create dataframe
9728            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9729            log.debug(f"query={query}")
9730            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9731
9732            vaf_normalization_set = []
9733
9734            # for each sample vaf_normalization
9735            for sample in self.get_header_sample_list():
9736                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9737                    lambda row: vaf_normalization(row, sample=sample), axis=1
9738                )
9739                vaf_normalization_set.append(
9740                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9741                )
9742
9743            # Add VAF to FORMAT
9744            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9745                "FORMAT"
9746            ].apply(lambda x: str(x) + ":VAF")
9747            vaf_normalization_set.append(
9748                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9749            )
9750
9751            # Add vaf_normalization to header
9752            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9753                id=vaf_normalization_tag,
9754                num="1",
9755                type="Float",
9756                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9757                type_code=self.code_type_map.get("Float"),
9758            )
9759
9760            # Create fields to add in INFO
9761            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9762
9763            # Update
9764            sql_update = f"""
9765                UPDATE {table_variants}
9766                SET {sql_vaf_normalization_set}
9767                FROM dataframe_vaf_normalization
9768                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9769
9770            """
9771            self.conn.execute(sql_update)
9772
9773            # Remove added columns
9774            for added_column in added_columns:
9775                self.drop_column(column=added_column)
9776
9777            # Delete dataframe
9778            del dataframe_vaf_normalization
9779            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Calculate genotype statistics for a given information field in a VCF
        file and update the INFO column of the variants table with the
        calculated statistics.

        The `genotype_stats` helper is applied over all samples for each
        variant; the resulting dict is split into one INFO tag per statistic
        (nb, list, min, max, mean, mediane, stdev), each declared in the VCF
        header.

        :param info: The `info` parameter is a string that represents the type
        of information for which genotype statistics are calculated. It is used
        to generate various VCF info tags for the statistics, such as the number
        of occurrences, the list of values, the minimum value, the maximum
        value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one per statistic; the dict's keys also drive the
            # extraction loop below
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field (dataframe column) holding the full stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (registered for removal at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with genotypes
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the stats dict per variant (row-wise over all samples)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic tag
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own column
                # (the lambda is applied immediately within this iteration,
                #  so `stat` is captured at its current value)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic tag in the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First statistic gets no leading ';' separator
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update (all statistic fragments concatenated)
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update INFO, joining on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_stats
            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, defaults to VAF
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
9919    def calculation_transcripts_annotation(
9920        self, info_json: str = None, info_format: str = None
9921    ) -> None:
9922        """
9923        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
9924        field to it if transcripts are available.
9925
9926        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
9927        is a string parameter that represents the information field to be used in the transcripts JSON.
9928        It is used to specify the JSON format for the transcripts information. If no value is provided
9929        when calling the method, it defaults to "
9930        :type info_json: str
9931        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
9932        method is a string parameter that specifies the format of the information field to be used in
9933        the transcripts JSON. It is used to define the format of the information field
9934        :type info_format: str
9935        """
9936
9937        # Create transcripts table
9938        transcripts_table = self.create_transcript_view()
9939
9940        # Add info field
9941        if transcripts_table:
9942            self.transcript_view_to_variants(
9943                transcripts_table=transcripts_table,
9944                transcripts_info_field_json=info_json,
9945                transcripts_info_field_format=info_format,
9946            )
9947        else:
9948            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: The info_json parameter in the calculation_transcripts_annotation method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, a default information field name is used.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
9950    def calculation_transcripts_prioritization(self) -> None:
9951        """
9952        The function `calculation_transcripts_prioritization` creates a transcripts table and
9953        prioritizes transcripts based on certain criteria.
9954        """
9955
9956        # Create transcripts table
9957        transcripts_table = self.create_transcript_view()
9958
9959        # Add info field
9960        if transcripts_table:
9961            self.transcripts_prioritization(transcripts_table=transcripts_table)
9962        else:
9963            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def calculation_transcripts_export(self) -> None:
9965    def calculation_transcripts_export(self) -> None:
9966        """ """
9967
9968        # Create transcripts table
9969        transcripts_table = self.create_transcript_view()
9970
9971        # Add info field
9972        if transcripts_table:
9973            self.transcripts_export(transcripts_table=transcripts_table)
9974        else:
9975            log.info("No Transcripts to process. Check param.json file configuration")
def transcripts_export(self, transcripts_table: str = None, param: dict = {}) -> bool:
 9981    def transcripts_export(
 9982        self, transcripts_table: str = None, param: dict = {}
 9983    ) -> bool:
 9984        """ """
 9985
 9986        log.debug("Start transcripts export...")
 9987
 9988        # Param
 9989        if not param:
 9990            param = self.get_param()
 9991
 9992        # Param export
 9993        param_transcript_export = param.get("transcripts", {}).get("export", {})
 9994
 9995        # Output file
 9996        transcripts_export_output = param_transcript_export.get("output", None)
 9997
 9998        if not param_transcript_export or not transcripts_export_output:
 9999            log.warning(f"No transcriipts export parameters defined!")
10000            return False
10001
10002        # List of transcripts annotations
10003        query_describe = f"""
10004            SELECT column_name
10005            FROM (
10006                    DESCRIBE SELECT * FROM {transcripts_table}
10007                )
10008            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10009        """
10010        transcripts_annotations_list = list(
10011            self.get_query_to_df(query=query_describe)["column_name"]
10012        )
10013
10014        # Create transcripts table for export
10015        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10016            random.choices(string.ascii_uppercase + string.digits, k=10)
10017        )
10018        query_create_transcripts_table_export = f"""
10019            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10020        """
10021        self.execute_query(query=query_create_transcripts_table_export)
10022
10023        # Output file format
10024        transcripts_export_output_format = get_file_format(
10025            filename=transcripts_export_output
10026        )
10027
10028        # Format VCF - construct INFO
10029        if transcripts_export_output_format in ["vcf"]:
10030
10031            # Construct query update INFO and header
10032            query_update_info = []
10033            for field in transcripts_annotations_list:
10034
10035                # If field not in header
10036                if field not in self.get_header_infos_list():
10037
10038                    # Add PZ Transcript in header
10039                    self.get_header().infos[field] = vcf.parser._Info(
10040                        field,
10041                        ".",
10042                        "String",
10043                        f"Annotation '{field}' from transcript view",
10044                        "unknown",
10045                        "unknown",
10046                        0,
10047                    )
10048
10049                # Add field as INFO/tag
10050                query_update_info.append(
10051                    f"""
10052                        CASE
10053                            WHEN "{field}" IS NOT NULL
10054                            THEN concat('{field}=', "{field}", ';')    
10055                            ELSE ''     
10056                        END
10057                        """
10058                )
10059
10060            # Query param
10061            query_update_info_value = (
10062                f""" concat('',  {", ".join(query_update_info)}) """
10063            )
10064            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10065
10066        else:
10067
10068            # Query param
10069            query_update_info_value = f""" NULL """
10070            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10071
10072        # Update query INFO column
10073        query_update = f"""
10074            UPDATE {transcripts_table_export}
10075            SET INFO = {query_update_info_value}
10076
10077        """
10078        self.execute_query(query=query_update)
10079
10080        # Export
10081        self.export_output(
10082            output_file=transcripts_export_output,
10083            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10084        )
10085
10086        # Drop transcripts export table
10087        query_drop_transcripts_table_export = f"""
10088            DROP TABLE {transcripts_table_export}
10089        """
10090        self.execute_query(query=query_drop_transcripts_table_export)
def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
10092    def transcripts_prioritization(
10093        self, transcripts_table: str = None, param: dict = {}
10094    ) -> bool:
10095        """
10096        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10097        and updates the variants table with the prioritized information.
10098
10099        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10100        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10101        This parameter is used to identify the table where the transcripts data is stored for the
10102        prioritization process
10103        :type transcripts_table: str
10104        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10105        that contains various configuration settings for the prioritization process of transcripts. It
10106        is used to customize the behavior of the prioritization algorithm and includes settings such as
10107        the prefix for prioritization fields, default profiles, and other
10108        :type param: dict
10109        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10110        transcripts prioritization process is successfully completed, and `False` if there are any
10111        issues or if no profile is defined for transcripts prioritization.
10112        """
10113
10114        log.debug("Start transcripts prioritization...")
10115
10116        # Param
10117        if not param:
10118            param = self.get_param()
10119
10120        # Variants table
10121        table_variants = self.get_table_variants()
10122
10123        # Transcripts table
10124        if transcripts_table is None:
10125            transcripts_table = self.create_transcript_view(
10126                transcripts_table="transcripts", param=param
10127            )
10128        if transcripts_table is None:
10129            msg_err = "No Transcripts table availalble"
10130            log.error(msg_err)
10131            raise ValueError(msg_err)
10132        log.debug(f"transcripts_table={transcripts_table}")
10133
10134        # Get transcripts columns
10135        columns_as_list_query = f"""
10136            DESCRIBE {transcripts_table}
10137        """
10138        columns_as_list = list(
10139            self.get_query_to_df(columns_as_list_query)["column_name"]
10140        )
10141
10142        # Create INFO if not exists
10143        if "INFO" not in columns_as_list:
10144            query_add_info = f"""
10145                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10146            """
10147            self.execute_query(query_add_info)
10148
10149        # Prioritization param and Force only PZ Score and Flag
10150        pz_param = param.get("transcripts", {}).get("prioritization", {})
10151
10152        # PZ profile by default
10153        pz_profile_default = (
10154            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10155        )
10156
10157        # Exit if no profile
10158        if pz_profile_default is None:
10159            log.warning("No profile defined for transcripts prioritization")
10160            return False
10161
10162        # PZ fields
10163        pz_param_pzfields = {}
10164
10165        # PZ field transcripts
10166        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10167
10168        # Add PZ Transcript in header
10169        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10170            pz_fields_transcripts,
10171            ".",
10172            "String",
10173            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10174            "unknown",
10175            "unknown",
10176            code_type_map["String"],
10177        )
10178
10179        # Mandatory fields
10180        pz_mandatory_fields_list = [
10181            "Score",
10182            "Flag",
10183            "Tags",
10184            "Comment",
10185            "Infos",
10186            "Class",
10187        ]
10188        pz_mandatory_fields = []
10189        for pz_mandatory_field in pz_mandatory_fields_list:
10190            pz_mandatory_fields.append(
10191                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10192            )
10193
10194        # PZ fields in param
10195        for pz_field in pz_param.get("pzfields", []):
10196            if pz_field in pz_mandatory_fields_list:
10197                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10198                    pz_param.get("pzprefix", "PTZ") + pz_field
10199                )
10200            else:
10201                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10202                pz_param_pzfields[pz_field] = pz_field_new
10203
10204                # Add PZ Transcript in header
10205                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10206                    pz_field_new,
10207                    ".",
10208                    "String",
10209                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10210                    "unknown",
10211                    "unknown",
10212                    code_type_map["String"],
10213                )
10214
10215        # PZ fields param
10216        pz_param["pzfields"] = pz_mandatory_fields
10217
10218        # Prioritization
10219        prioritization_result = self.prioritization(
10220            table=transcripts_table,
10221            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10222        )
10223        if not prioritization_result:
10224            log.warning("Transcripts prioritization not processed")
10225            return False
10226
10227        # PZ fields sql query
10228        query_update_select_list = []
10229        query_update_concat_list = []
10230        query_update_order_list = []
10231        for pz_param_pzfield in set(
10232            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10233        ):
10234            query_update_select_list.append(f" {pz_param_pzfield}, ")
10235
10236        for pz_param_pzfield in pz_param_pzfields:
10237            query_update_concat_list.append(
10238                f"""
10239                    , CASE 
10240                        WHEN {pz_param_pzfield} IS NOT NULL
10241                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10242                        ELSE ''
10243                    END
10244                """
10245            )
10246
10247        # Order by
10248        pz_orders = (
10249            param.get("transcripts", {})
10250            .get("prioritization", {})
10251            .get("prioritization_transcripts_order", {})
10252        )
10253        if not pz_orders:
10254            pz_orders = {
10255                pz_param.get("pzprefix", "PTZ") + "Flag": "ASC",
10256                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10257            }
10258        for pz_order in pz_orders:
10259            query_update_order_list.append(
10260                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10261            )
10262
10263        # Fields to explode
10264        fields_to_explode = (
10265            list(pz_param_pzfields.keys())
10266            + pz_mandatory_fields
10267            + list(pz_orders.keys())
10268        )
10269        # Remove transcript column as a specific transcript column
10270        if "transcript" in fields_to_explode:
10271            fields_to_explode.remove("transcript")
10272
10273        # Fields intranscripts table
10274        query_transcripts_table = f"""
10275            DESCRIBE SELECT * FROM {transcripts_table}
10276        """
10277        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10278
10279        # Check fields to explode
10280        for field_to_explode in fields_to_explode:
10281            if field_to_explode not in self.get_header_infos_list() + list(
10282                query_transcripts_table.column_name
10283            ):
10284                msg_err = f"INFO/{field_to_explode} NOT IN header"
10285                log.error(msg_err)
10286                raise ValueError(msg_err)
10287
10288        # Explode fields to explode
10289        self.explode_infos(
10290            table=transcripts_table,
10291            fields=fields_to_explode,
10292        )
10293
10294        # Transcript preference file
10295        transcripts_preference_file = (
10296            param.get("transcripts", {})
10297            .get("prioritization", {})
10298            .get("prioritization_transcripts", {})
10299        )
10300        transcripts_preference_file = full_path(transcripts_preference_file)
10301
10302        # Transcript preference forced
10303        transcript_preference_force = (
10304            param.get("transcripts", {})
10305            .get("prioritization", {})
10306            .get("prioritization_transcripts_force", False)
10307        )
10308        # Transcript version forced
10309        transcript_version_force = (
10310            param.get("transcripts", {})
10311            .get("prioritization", {})
10312            .get("prioritization_transcripts_version_force", False)
10313        )
10314
10315        # Transcripts Ranking
10316        if transcripts_preference_file:
10317
10318            # Transcripts file to dataframe
10319            if os.path.exists(transcripts_preference_file):
10320                transcripts_preference_dataframe = transcripts_file_to_df(
10321                    transcripts_preference_file
10322                )
10323            else:
10324                log.error(
10325                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10326                )
10327                raise ValueError(
10328                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10329                )
10330
10331            # Order by depending to transcript preference forcing
10332            if transcript_preference_force:
10333                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10334            else:
10335                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10336
10337            # Transcript columns joined depend on version consideration
10338            if transcript_version_force:
10339                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10340            else:
10341                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10342
10343            # Query ranking for update
10344            query_update_ranking = f"""
10345                SELECT
10346                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10347                    ROW_NUMBER() OVER (
10348                        PARTITION BY "#CHROM", POS, REF, ALT
10349                        ORDER BY {order_by}
10350                    ) AS rn
10351                FROM {transcripts_table}
10352                LEFT JOIN 
10353                    (
10354                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10355                        FROM transcripts_preference_dataframe
10356                    ) AS transcripts_preference
10357                ON {transcripts_version_join}
10358            """
10359
10360        else:
10361
10362            # Query ranking for update
10363            query_update_ranking = f"""
10364                SELECT
10365                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10366                    ROW_NUMBER() OVER (
10367                        PARTITION BY "#CHROM", POS, REF, ALT
10368                        ORDER BY {" , ".join(query_update_order_list)}
10369                    ) AS rn
10370                FROM {transcripts_table}
10371            """
10372
10373        # Export Transcripts prioritization infos to variants table
10374        query_update = f"""
10375            WITH RankedTranscripts AS (
10376                {query_update_ranking}
10377            )
10378            UPDATE {table_variants}
10379                SET
10380                INFO = CONCAT(CASE
10381                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10382                            THEN ''
10383                            ELSE concat("INFO", ';')
10384                        END,
10385                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10386                        )
10387            FROM
10388                RankedTranscripts
10389            WHERE
10390                rn = 1
10391                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10392                AND variants."POS" = RankedTranscripts."POS"
10393                AND variants."REF" = RankedTranscripts."REF"
10394                AND variants."ALT" = RankedTranscripts."ALT"     
10395        """
10396
10397        # log.debug(f"query_update={query_update}")
10398        self.execute_query(query=query_update)
10399
10400        # Return
10401        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10403    def create_transcript_view_from_columns_map(
10404        self,
10405        transcripts_table: str = "transcripts",
10406        columns_maps: dict = {},
10407        added_columns: list = [],
10408        temporary_tables: list = None,
10409        annotation_fields: list = None,
10410        column_rename: dict = {},
10411        column_clean: bool = False,
10412        column_case: str = None,
10413    ) -> tuple[list, list, list]:
10414        """
10415        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10416        specified columns mapping for transcripts data.
10417
10418        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10419        of the table where the transcripts data is stored or will be stored in the database. This table
10420        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10421        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10422        :type transcripts_table: str (optional)
10423        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10424        about how to map columns from a transcripts table to create a view. Each entry in the
10425        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10426        typically includes details such as the main transcript column and additional information columns
10427        :type columns_maps: dict
10428        :param added_columns: The `added_columns` parameter in the
10429        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10430        that will be added to the view being created based on the columns map provided. These columns
10431        are generated by exploding the transcript information columns along with the main transcript
10432        column
10433        :type added_columns: list
10434        :param temporary_tables: The `temporary_tables` parameter in the
10435        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10436        tables created during the process of creating a transcript view from a columns map. These
10437        temporary tables are used to store intermediate results or transformations before the final view
10438        is generated
10439        :type temporary_tables: list
10440        :param annotation_fields: The `annotation_fields` parameter in the
10441        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10442        used for annotation in the query view creation process. These fields are extracted from the
10443        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10444        :type annotation_fields: list
10445        :param column_rename: The `column_rename` parameter in the
10446        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10447        custom renaming for columns during the creation of the temporary table view. This parameter
10448        provides a mapping of original column names to the desired renamed column names. By using this
10449        parameter,
10450        :type column_rename: dict
10451        :param column_clean: The `column_clean` parameter in the
10452        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10453        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10454        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10455        False
10456        :type column_clean: bool (optional)
10457        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10458        function is used to specify the case transformation to be applied to the columns during the view
10459        creation process. It allows you to control whether the column values should be converted to
10460        lowercase, uppercase, or remain unchanged
10461        :type column_case: str
10462        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10463        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10464        """
10465
10466        log.debug("Start transcrpts view creation from columns map...")
10467
10468        # "from_columns_map": [
10469        #     {
10470        #         "transcripts_column": "Ensembl_transcriptid",
10471        #         "transcripts_infos_columns": [
10472        #             "genename",
10473        #             "Ensembl_geneid",
10474        #             "LIST_S2_score",
10475        #             "LIST_S2_pred",
10476        #         ],
10477        #     },
10478        #     {
10479        #         "transcripts_column": "Ensembl_transcriptid",
10480        #         "transcripts_infos_columns": [
10481        #             "genename",
10482        #             "VARITY_R_score",
10483        #             "Aloft_pred",
10484        #         ],
10485        #     },
10486        # ],
10487
10488        # Init
10489        if temporary_tables is None:
10490            temporary_tables = []
10491        if annotation_fields is None:
10492            annotation_fields = []
10493
10494        # Variants table
10495        table_variants = self.get_table_variants()
10496
10497        for columns_map in columns_maps:
10498
10499            # Transcript column
10500            transcripts_column = columns_map.get("transcripts_column", None)
10501
10502            # Transcripts infos columns
10503            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10504
10505            # Transcripts infos columns rename
10506            column_rename = columns_map.get("column_rename", column_rename)
10507
10508            # Transcripts infos columns clean
10509            column_clean = columns_map.get("column_clean", column_clean)
10510
10511            # Transcripts infos columns case
10512            column_case = columns_map.get("column_case", column_case)
10513
10514            if transcripts_column is not None:
10515
10516                # Explode
10517                added_columns += self.explode_infos(
10518                    fields=[transcripts_column] + transcripts_infos_columns
10519                )
10520
10521                # View clauses
10522                clause_select_variants = []
10523                clause_select_tanscripts = []
10524                for field in [transcripts_column] + transcripts_infos_columns:
10525
10526                    # AS field
10527                    as_field = field
10528
10529                    # Rename
10530                    if column_rename:
10531                        as_field = column_rename.get(as_field, as_field)
10532
10533                    # Clean
10534                    if column_clean:
10535                        as_field = clean_annotation_field(as_field)
10536
10537                    # Case
10538                    if column_case:
10539                        if column_case.lower() in ["lower"]:
10540                            as_field = as_field.lower()
10541                        elif column_case.lower() in ["upper"]:
10542                            as_field = as_field.upper()
10543
10544                    # Clause select Variants
10545                    clause_select_variants.append(
10546                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10547                    )
10548
10549                    if field in [transcripts_column]:
10550                        clause_select_tanscripts.append(
10551                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10552                        )
10553                    else:
10554                        clause_select_tanscripts.append(
10555                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10556                        )
10557                        annotation_fields.append(as_field)
10558
10559                # Querey View
10560                query = f""" 
10561                    SELECT
10562                        "#CHROM", POS, REF, ALT, INFO,
10563                        "{transcripts_column}" AS 'transcript',
10564                        {", ".join(clause_select_tanscripts)}
10565                    FROM (
10566                        SELECT 
10567                            "#CHROM", POS, REF, ALT, INFO,
10568                            {", ".join(clause_select_variants)}
10569                        FROM {table_variants}
10570                        )
10571                    WHERE "{transcripts_column}" IS NOT NULL
10572                """
10573
10574                # Create temporary table
10575                temporary_table = transcripts_table + "".join(
10576                    random.choices(string.ascii_uppercase + string.digits, k=10)
10577                )
10578
10579                # Temporary_tables
10580                temporary_tables.append(temporary_table)
10581                query_view = f"""
10582                    CREATE TEMPORARY TABLE {temporary_table}
10583                    AS ({query})
10584                """
10585                self.execute_query(query=query_view)
10586
10587        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the `columns_maps` configuration.
  • column_rename: The column_rename parameter in the create_transcript_view_from_columns_map function is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view. This parameter provides a mapping of original column names to the desired renamed column names. By using this parameter,
  • column_clean: The column_clean parameter in the create_transcript_view_from_columns_map function is a boolean flag that determines whether the column values should be cleaned or not. If set to True, the column values will be cleaned by removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to False
  • column_case: The column_case parameter in the create_transcript_view_from_columns_map function is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged
Returns

The create_transcript_view_from_columns_map function returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10589    def create_transcript_view_from_column_format(
10590        self,
10591        transcripts_table: str = "transcripts",
10592        column_formats: dict = {},
10593        temporary_tables: list = None,
10594        annotation_fields: list = None,
10595        column_rename: dict = {},
10596        column_clean: bool = False,
10597        column_case: str = None,
10598    ) -> tuple[list, list, list]:
10599        """
10600        The `create_transcript_view_from_column_format` function generates a transcript view based on
10601        specified column formats, adds additional columns and annotation fields, and returns the list of
10602        temporary tables and annotation fields.
10603
10604        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10605        of the table containing the transcripts data. This table will be used as the base table for
10606        creating the transcript view. The default value for this parameter is "transcripts", but you can
10607        provide a different table name if needed, defaults to transcripts
10608        :type transcripts_table: str (optional)
10609        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10610        about the columns to be used for creating the transcript view. Each entry in the dictionary
10611        specifies the mapping between a transcripts column and a transcripts infos column. This
10612        parameter allows you to define how the columns from the transcripts table should be transformed
10613        or mapped
10614        :type column_formats: dict
10615        :param temporary_tables: The `temporary_tables` parameter in the
10616        `create_transcript_view_from_column_format` function is a list that stores the names of
10617        temporary views created during the process of creating a transcript view from a column format.
10618        These temporary views are used to manipulate and extract data before generating the final
10619        transcript view
10620        :type temporary_tables: list
10621        :param annotation_fields: The `annotation_fields` parameter in the
10622        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10623        that are extracted from the temporary views created during the process. These annotation fields
10624        are obtained by querying the temporary views and extracting the column names excluding specific
10625        columns like `#CH
10626        :type annotation_fields: list
10627        :param column_rename: The `column_rename` parameter in the
10628        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10629        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10630        column names to new column names in this dictionary, you can rename specific columns during the
10631        process
10632        :type column_rename: dict
10633        :param column_clean: The `column_clean` parameter in the
10634        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10635        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10636        will be cleaned during the creation of the transcript view based on the specified column format,
10637        defaults to False
10638        :type column_clean: bool (optional)
10639        :param column_case: The `column_case` parameter in the
10640        `create_transcript_view_from_column_format` function is used to specify the case transformation
10641        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10642        to convert the column names to uppercase or lowercase, respectively
10643        :type column_case: str
10644        :return: The `create_transcript_view_from_column_format` function returns two lists:
10645        `temporary_tables` and `annotation_fields`.
10646        """
10647
10648        log.debug("Start transcrpts view creation from column format...")
10649
10650        #  "from_column_format": [
10651        #     {
10652        #         "transcripts_column": "ANN",
10653        #         "transcripts_infos_column": "Feature_ID",
10654        #     }
10655        # ],
10656
10657        # Init
10658        if temporary_tables is None:
10659            temporary_tables = []
10660        if annotation_fields is None:
10661            annotation_fields = []
10662
10663        for column_format in column_formats:
10664
10665            # annotation field and transcript annotation field
10666            annotation_field = column_format.get("transcripts_column", "ANN")
10667            transcript_annotation = column_format.get(
10668                "transcripts_infos_column", "Feature_ID"
10669            )
10670
10671            # Transcripts infos columns rename
10672            column_rename = column_format.get("column_rename", column_rename)
10673
10674            # Transcripts infos columns clean
10675            column_clean = column_format.get("column_clean", column_clean)
10676
10677            # Transcripts infos columns case
10678            column_case = column_format.get("column_case", column_case)
10679
10680            # Temporary View name
10681            temporary_view_name = transcripts_table + "".join(
10682                random.choices(string.ascii_uppercase + string.digits, k=10)
10683            )
10684
10685            # Create temporary view name
10686            temporary_view_name = self.annotation_format_to_table(
10687                uniquify=True,
10688                annotation_field=annotation_field,
10689                view_name=temporary_view_name,
10690                annotation_id=transcript_annotation,
10691                column_rename=column_rename,
10692                column_clean=column_clean,
10693                column_case=column_case,
10694            )
10695
10696            # Annotation fields
10697            if temporary_view_name:
10698                query_annotation_fields = f"""
10699                    SELECT *
10700                    FROM (
10701                        DESCRIBE SELECT *
10702                        FROM {temporary_view_name}
10703                        )
10704                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10705                """
10706                df_annotation_fields = self.get_query_to_df(
10707                    query=query_annotation_fields
10708                )
10709
10710                # Add temporary view and annotation fields
10711                temporary_tables.append(temporary_view_name)
10712                annotation_fields += list(set(df_annotation_fields["column_name"]))
10713
10714        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding specific columns such as `#CHROM`, `POS`, `REF`, and `ALT`.
  • column_rename: The column_rename parameter in the create_transcript_view_from_column_format function is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process
  • column_clean: The column_clean parameter in the create_transcript_view_from_column_format function is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set to True, the columns will be cleaned during the creation of the transcript view based on the specified column format, defaults to False
  • column_case: The column_case parameter in the create_transcript_view_from_column_format function is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = True, param: dict = {}) -> str:
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = True,
        param: dict = {},
    ) -> str:
        """
        The `create_transcript_view` function generates a transcript view by processing data from a
        specified table based on provided parameters and structural information.

        It merges per-source temporary views (built from columns maps and column formats),
        optionally maps/normalizes transcript identifiers via a mapping file, aggregates
        annotation values per transcript, and materializes the result as a table.

        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of
        the table that will store the final transcript view data. If not provided, the name is
        taken from the `transcripts.table` parameter, defaulting to "transcripts"
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: The `transcripts_table_drop` parameter is a boolean that
        determines whether to drop the existing transcripts table before creating a new one. If
        set to `True`, the function will drop the existing transcripts table if it exists,
        defaults to True
        :type transcripts_table_drop: bool (optional)
        :param param: The `param` parameter is a dictionary that contains the information needed
        to create the transcript view: the structure of the transcripts ("struct" with
        "from_columns_map" and "from_column_format"), transcript id version/mapping options, etc.
        When empty, the object's own parameters are used
        :type param: dict
        :return: the name of the transcripts table that was created, or None when no
        `transcripts.struct` configuration is available
        """

        log.debug("Start transcripts view creation...")

        # Default table name when none is configured
        transcripts_table_default = "transcripts"

        # Param: fall back to the object's configured parameters
        if not param:
            param = self.get_param()

        # Struct: describes how transcript data is laid out in the variants table
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: whether to strip the ".N" version suffix from transcript ids
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping: optional file mapping transcript aliases to canonical ids
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Transcripts mapping force: restrict output to transcripts present in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table: resolve name from param when not given explicitly
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table during view creation (dropped at the end)
            added_columns = []

            # Temporary tables created by the helpers below
            temporary_tables = []

            # Annotation fields exposed by the temporary tables
            annotation_fields = []

            # From columns map.
            # NOTE(review): the helper appends to the lists passed in AND returns them,
            # so the `+=` below duplicates entries; duplicates are removed later via set().
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # From column format (same duplicate-then-dedup pattern as above)
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Remove key/reserved columns so only true annotation fields are aggregated
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query: UNION BY NAME aligns heterogeneous columns
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for nested subqueries in the merge queries below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # SELECT clauses aggregating values per transcript
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list: comma-joined distinct transcript ids per variant group
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotation fields the same way (distinct values, comma-joined)
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping branch: join against the mapping file
            if transcript_id_mapping_file:

                # Transcript mapping dataframe.
                # NOTE(review): `transcript_id_mapping_dataframe` looks unused, but the SQL
                # below references a table named after it — presumably DuckDB resolves it
                # via its replacement scan on local Python variables, so the local variable
                # must keep this exact name; confirm before refactoring.
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version removal: compare/group on the id without its ".N" suffix
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    # NOTE(review): the join clause is identical in both branches (always
                    # version-insensitive); only select/group-by differ
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Group-by expression: prefer the mapped id, fall back to the original id
                query_transcript_merge_group_by = """
                        CASE
                            WHEN transcript_mapped NOT IN ('')
                            THEN split_part(transcript_mapped, '.', 1)
                            ELSE split_part(transcript_original, '.', 1)
                        END
                    """

                # First merge: aggregate annotations per (variant, original id, mapped id)
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve the column list of the merge query (DESCRIBE, no execution)
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Build the SELECT clause re-aggregating each column after mapping
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        if field in ["transcript_original"]:
                            # NOTE(review): output alias is "transcripts_mapped" (plural)
                            # while the CASE below uses "transcript_mapped" — confirm this
                            # naming difference is intentional
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Second merge: collapse rows sharing the same mapped transcript id
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Optionally keep only transcripts listed in the mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping branch
            else:

                # Remove transcript version suffix when requested
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections.
                # NOTE(review): these two assignments appear unused in this branch —
                # the final query below uses `query_transcript_column` directly
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view (transcript_mapped kept as NULL for a
                # schema compatible with the mapping branch)
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop the transcripts table first if requested
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create the transcripts table from the merge query
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove columns that were added to the variants table as a side effect
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # No struct configuration: nothing to create
            transcripts_table = None

        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function takes the name from the configuration, defaulting to "transcripts".
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to True
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts', column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> str:
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a structured annotation INFO field (e.g. snpEff/VEP "ANN") into a
        temporary table with one typed column per annotation sub-field.

        The list of sub-field names is parsed from the quoted, pipe-separated
        header description of `annotation_field`. Each variant's annotation
        string is converted to JSON, the JSON keys become table columns (after
        optional renaming, cleaning and case transformation), and a temporary
        table named `view_name` is created with one row per annotation entry
        plus a 'transcript' column taken from the `annotation_id` sub-field.

        :param uniquify: If True, annotation values are uniquified when the
        annotation string is exploded, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: Name of the INFO field containing the
        structured annotations, defaults to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as the transcript
        identifier; exposed as the 'transcript' column of the resulting table,
        defaults to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table created to hold the
        exploded annotations, defaults to "transcripts"
        :type view_name: str (optional)
        :param column_rename: Mapping of original annotation sub-field names to
        custom column names
        :type column_rename: dict
        :param column_clean: If True, column names are cleaned with
        `clean_annotation_field` before being used, defaults to False
        :type column_clean: bool (optional)
        :param column_case: Case transformation applied to column names:
        "lower", "upper", or None to keep them unchanged
        :type column_case: str
        :return: The name of the temporary table created (`view_name`), or None
        if `annotation_field` is not present in the VCF header.
        """

        # Name of the intermediate column holding the annotation exploded to JSON
        annotation_format = "annotation_explode"

        # Apply rename/clean to the transcript identifier so it matches the
        # transformed column names generated below
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded INFO columns
        # NOTE(review): any configured prefix is normalized to "INFO/" and the
        # configured value itself is discarded — presumably intentional; confirm
        # against get_explode_infos_prefix usage elsewhere
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names of the exploded annotation field and of its JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header
        vcf_reader = self.get_header()

        # Track columns added by this method so they can be dropped on exit
        added_columns = []

        # Explode the annotation INFO field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the quoted,
            # pipe-separated list in the INFO field description
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the normalized name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    # NOTE(review): ann_header is populated but never read below
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants (with the exploded annotation column) into a DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string to a JSON document keyed by entry
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the distinct JSON keys (annotation sub-field names);
            # the local DataFrame is queried directly by DuckDB (replacement scan)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]
                key_clean = key

                # key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Extract this key's values to infer the column type
                # NOTE(review): trim('{key}') trims the key *literal*, not the
                # extracted value, so this WHERE filters nothing unless the key
                # itself is blank — presumably meant to skip empty values;
                # harmless, as empty values are dropped below anyway
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Append a typed, NULL-safe extraction expression for this key
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table: one row per annotation entry, with a
            # 'transcript' column taken from the annotation_id sub-field
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the header: nothing to explode
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name

The annotation_format_to_table function converts annotation data from a VCF file into a structured table format, ensuring unique values and creating a temporary table for further processing or analysis.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. By default, it is set to "ANN", defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the table; defaults to Feature_ID
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis; defaults to transcripts
  • column_rename: The column_rename parameter in the annotation_format_to_table method is a dictionary that allows you to specify custom renaming for columns. By providing key-value pairs in this dictionary, you can rename specific columns in the resulting table or view that is created based on the annotation data. This feature enables
  • column_clean: The column_clean parameter in the annotation_format_to_table method is a boolean flag that determines whether the annotation field should undergo a cleaning process. If set to True, the function will clean the annotation field before further processing. This cleaning step may involve removing unwanted characters or formatting inconsistencies; defaults to False
  • column_case: The column_case parameter in the annotation_format_to_table method is used to specify the case transformation to be applied to the column names extracted from the annotation data. It allows you to set the case of the column names to either lowercase or uppercase for consistency or other specific requirements during the conversion
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
11200    def transcript_view_to_variants(
11201        self,
11202        transcripts_table: str = None,
11203        transcripts_column_id: str = None,
11204        transcripts_info_json: str = None,
11205        transcripts_info_field_json: str = None,
11206        transcripts_info_format: str = None,
11207        transcripts_info_field_format: str = None,
11208        param: dict = {},
11209    ) -> bool:
11210        """
11211        The `transcript_view_to_variants` function updates a variants table with information from
11212        transcripts in JSON format.
11213
11214        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11215        table containing the transcripts data. If this parameter is not provided, the function will
11216        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11217        :type transcripts_table: str
11218        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11219        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11220        identifier is used to match transcripts with variants in the database
11221        :type transcripts_column_id: str
11222        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11223        of the column in the variants table where the transcripts information will be stored in JSON
11224        format. This parameter allows you to define the column in the variants table that will hold the
11225        JSON-formatted information about transcripts
11226        :type transcripts_info_json: str
11227        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11228        specify the field in the VCF header that will contain information about transcripts in JSON
11229        format. This field will be added to the VCF header as an INFO field with the specified name
11230        :type transcripts_info_field_json: str
11231        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11232        format of the information about transcripts that will be stored in the variants table. This
11233        format can be used to define how the transcript information will be structured or displayed
11234        within the variants table
11235        :type transcripts_info_format: str
11236        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11237        specify the field in the VCF header that will contain information about transcripts in a
11238        specific format. This field will be added to the VCF header as an INFO field with the specified
11239        name
11240        :type transcripts_info_field_format: str
11241        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11242        that contains various configuration settings related to transcripts. It is used to provide
11243        default values for certain parameters if they are not explicitly provided when calling the
11244        method. The `param` dictionary can be passed as an argument
11245        :type param: dict
11246        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11247        if the operation is successful and `False` if certain conditions are not met.
11248        """
11249
11250        msg_info_prefix = "Start transcripts view to variants annotations"
11251
11252        log.debug(f"{msg_info_prefix}...")
11253
11254        # Default
11255        transcripts_table_default = "transcripts"
11256        transcripts_column_id_default = "transcript"
11257        transcripts_info_json_default = None
11258        transcripts_info_format_default = None
11259        transcripts_info_field_json_default = None
11260        transcripts_info_field_format_default = None
11261
11262        # Param
11263        if not param:
11264            param = self.get_param()
11265
11266        # Transcripts table
11267        if transcripts_table is None:
11268            transcripts_table = param.get("transcripts", {}).get(
11269                "table", transcripts_table_default
11270            )
11271
11272        # Transcripts column ID
11273        if transcripts_column_id is None:
11274            transcripts_column_id = param.get("transcripts", {}).get(
11275                "column_id", transcripts_column_id_default
11276            )
11277
11278        # Transcripts info json
11279        if transcripts_info_json is None:
11280            transcripts_info_json = param.get("transcripts", {}).get(
11281                "transcripts_info_json", transcripts_info_json_default
11282            )
11283
11284        # Transcripts info field JSON
11285        if transcripts_info_field_json is None:
11286            transcripts_info_field_json = param.get("transcripts", {}).get(
11287                "transcripts_info_field_json", transcripts_info_field_json_default
11288            )
11289        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11290        #     transcripts_info_json = transcripts_info_field_json
11291
11292        # Transcripts info format
11293        if transcripts_info_format is None:
11294            transcripts_info_format = param.get("transcripts", {}).get(
11295                "transcripts_info_format", transcripts_info_format_default
11296            )
11297
11298        # Transcripts info field FORMAT
11299        if transcripts_info_field_format is None:
11300            transcripts_info_field_format = param.get("transcripts", {}).get(
11301                "transcripts_info_field_format", transcripts_info_field_format_default
11302            )
11303        # if (
11304        #     transcripts_info_field_format is not None
11305        #     and transcripts_info_format is None
11306        # ):
11307        #     transcripts_info_format = transcripts_info_field_format
11308
11309        # Variants table
11310        table_variants = self.get_table_variants()
11311
11312        # Check info columns param
11313        if (
11314            transcripts_info_json is None
11315            and transcripts_info_field_json is None
11316            and transcripts_info_format is None
11317            and transcripts_info_field_format is None
11318        ):
11319            return False
11320
11321        # Transcripts infos columns
11322        query_transcripts_infos_columns = f"""
11323            SELECT *
11324            FROM (
11325                DESCRIBE SELECT * FROM {transcripts_table}
11326                )
11327            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11328        """
11329        transcripts_infos_columns = list(
11330            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11331        )
11332
11333        # View results
11334        clause_select = []
11335        clause_to_json = []
11336        clause_to_format = []
11337        for field in transcripts_infos_columns:
11338            # Do not consider INFO field for export into fields
11339            if field not in ["INFO"]:
11340                clause_select.append(
11341                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11342                )
11343                clause_to_json.append(f""" '{field}': "{field}" """)
11344                clause_to_format.append(f""" "{field}" """)
11345
11346        # Update
11347        update_set_json = []
11348        update_set_format = []
11349
11350        # VCF header
11351        vcf_reader = self.get_header()
11352
11353        # Transcripts to info column in JSON
11354        if transcripts_info_json:
11355
11356            # Create column on variants table
11357            self.add_column(
11358                table_name=table_variants,
11359                column_name=transcripts_info_json,
11360                column_type="JSON",
11361                default_value=None,
11362                drop=False,
11363            )
11364
11365            # Add header
11366            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11367                transcripts_info_json,
11368                ".",
11369                "String",
11370                "Transcripts in JSON format",
11371                "unknwon",
11372                "unknwon",
11373                self.code_type_map["String"],
11374            )
11375
11376            # Add to update
11377            update_set_json.append(
11378                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11379            )
11380
11381        # Transcripts to info field in JSON
11382        if transcripts_info_field_json:
11383
11384            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11385
11386            # Add to update
11387            update_set_json.append(
11388                f""" 
11389                    INFO = concat(
11390                            CASE
11391                                WHEN INFO NOT IN ('', '.')
11392                                THEN INFO
11393                                ELSE ''
11394                            END,
11395                            CASE
11396                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11397                                THEN concat(
11398                                    ';{transcripts_info_field_json}=',
11399                                    t.{transcripts_info_json}
11400                                )
11401                                ELSE ''
11402                            END
11403                            )
11404                """
11405            )
11406
11407            # Add header
11408            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11409                transcripts_info_field_json,
11410                ".",
11411                "String",
11412                "Transcripts in JSON format",
11413                "unknwon",
11414                "unknwon",
11415                self.code_type_map["String"],
11416            )
11417
11418        if update_set_json:
11419
11420            # Update query
11421            query_update = f"""
11422                UPDATE {table_variants}
11423                    SET {", ".join(update_set_json)}
11424                FROM
11425                (
11426                    SELECT
11427                        "#CHROM", POS, REF, ALT,
11428                            concat(
11429                            '{{',
11430                            string_agg(
11431                                '"' || "{transcripts_column_id}" || '":' ||
11432                                to_json(json_output)
11433                            ),
11434                            '}}'
11435                            )::JSON AS {transcripts_info_json}
11436                    FROM
11437                        (
11438                        SELECT
11439                            "#CHROM", POS, REF, ALT,
11440                            "{transcripts_column_id}",
11441                            to_json(
11442                                {{{",".join(clause_to_json)}}}
11443                            )::JSON AS json_output
11444                        FROM
11445                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11446                        WHERE "{transcripts_column_id}" IS NOT NULL
11447                        )
11448                    GROUP BY "#CHROM", POS, REF, ALT
11449                ) AS t
11450                WHERE {table_variants}."#CHROM" = t."#CHROM"
11451                    AND {table_variants}."POS" = t."POS"
11452                    AND {table_variants}."REF" = t."REF"
11453                    AND {table_variants}."ALT" = t."ALT"
11454            """
11455
11456            self.execute_query(query=query_update)
11457
11458        # Transcripts to info column in FORMAT
11459        if transcripts_info_format:
11460
11461            # Create column on variants table
11462            self.add_column(
11463                table_name=table_variants,
11464                column_name=transcripts_info_format,
11465                column_type="VARCHAR",
11466                default_value=None,
11467                drop=False,
11468            )
11469
11470            # Add header
11471            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11472                transcripts_info_format,
11473                ".",
11474                "String",
11475                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11476                "unknwon",
11477                "unknwon",
11478                self.code_type_map["String"],
11479            )
11480
11481            # Add to update
11482            update_set_format.append(
11483                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11484            )
11485
11486        else:
11487
11488            # Set variable for internal queries
11489            transcripts_info_format = "transcripts_info_format"
11490
11491        # Transcripts to info field in JSON
11492        if transcripts_info_field_format:
11493
11494            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11495
11496            # Add to update
11497            update_set_format.append(
11498                f""" 
11499                    INFO = concat(
11500                            CASE
11501                                WHEN INFO NOT IN ('', '.')
11502                                THEN INFO
11503                                ELSE ''
11504                            END,
11505                            CASE
11506                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11507                                THEN concat(
11508                                    ';{transcripts_info_field_format}=',
11509                                    t.{transcripts_info_format}
11510                                )
11511                                ELSE ''
11512                            END
11513                            )
11514                """
11515            )
11516
11517            # Add header
11518            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11519                transcripts_info_field_format,
11520                ".",
11521                "String",
11522                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11523                "unknwon",
11524                "unknwon",
11525                self.code_type_map["String"],
11526            )
11527
11528        if update_set_format:
11529
11530            # Update query
11531            query_update = f"""
11532                UPDATE {table_variants}
11533                    SET {", ".join(update_set_format)}
11534                FROM
11535                (
11536                    SELECT
11537                        "#CHROM", POS, REF, ALT,
11538                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11539                    FROM 
11540                        (
11541                        SELECT
11542                            "#CHROM", POS, REF, ALT,
11543                            "{transcripts_column_id}",
11544                            concat(
11545                                "{transcripts_column_id}",
11546                                '|',
11547                                {", '|', ".join(clause_to_format)}
11548                            ) AS {transcripts_info_format}
11549                        FROM
11550                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11551                        )
11552                    GROUP BY "#CHROM", POS, REF, ALT
11553                ) AS t
11554                WHERE {table_variants}."#CHROM" = t."#CHROM"
11555                    AND {table_variants}."POS" = t."POS"
11556                    AND {table_variants}."REF" = t."REF"
11557                    AND {table_variants}."ALT" = t."ALT"
11558            """
11559
11560            self.execute_query(query=query_update)
11561
11562        return True

The transcript_view_to_variants function updates a variants table with information from transcripts in JSON format.

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or use a default value of "transcripts"
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
  • transcripts_info_field_json: The transcripts_info_field_json parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • transcripts_info_format: The transcripts_info_format parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table
  • transcripts_info_field_format: The transcripts_info_field_format parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name
  • param: The param parameter in the transcript_view_to_variants method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The param dictionary can be passed as an argument
Returns

The function transcript_view_to_variants returns a boolean value. It returns True if the operation is successful and False if certain conditions are not met.